Milestone 2: Descriptive Statistics¶

Import libraries¶

In [1]:
!pip install pandasql
Requirement already satisfied: pandasql in c:\users\kayan\anaconda3\lib\site-packages (0.7.3)
Requirement already satisfied: pandas in c:\users\kayan\anaconda3\lib\site-packages (from pandasql) (1.4.3)
Requirement already satisfied: numpy in c:\users\kayan\anaconda3\lib\site-packages (from pandasql) (1.21.5)
Requirement already satisfied: sqlalchemy in c:\users\kayan\anaconda3\lib\site-packages (from pandasql) (1.4.39)
Requirement already satisfied: python-dateutil>=2.8.1 in c:\users\kayan\anaconda3\lib\site-packages (from pandas->pandasql) (2.8.2)
Requirement already satisfied: pytz>=2020.1 in c:\users\kayan\anaconda3\lib\site-packages (from pandas->pandasql) (2022.1)
Requirement already satisfied: greenlet!=0.4.17 in c:\users\kayan\anaconda3\lib\site-packages (from sqlalchemy->pandasql) (1.1.1)
Requirement already satisfied: six>=1.5 in c:\users\kayan\anaconda3\lib\site-packages (from python-dateutil>=2.8.1->pandas->pandasql) (1.16.0)
In [4]:
import pandas as pd 
import numpy as np
import dash as ds
import plotly.graph_objects as go
import plotly.express as px
from jupyter_dash import JupyterDash
import ast
import pprint
import timeit
from functools import reduce 
from matplotlib import pyplot as plt 
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm

#set up the SQL environment
from pandasql import sqldf
pysqldf = lambda q: sqldf(q, globals())
In [5]:
# set pandas display optio
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 100)

Load data¶

In [6]:
olympics_games = pd.read_csv(r'C:\Users\kayan\Learning\Coursera - Final Project\sportstats\Milestone1\Data\olympicsgames_joined.csv')
In [7]:
olympics_games.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271116 entries, 0 to 271115
Data columns (total 18 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   ID       271116 non-null  int64  
 1   Name     271116 non-null  object 
 2   Sex      271116 non-null  object 
 3   Age      261642 non-null  float64
 4   Height   210945 non-null  float64
 5   Weight   207183 non-null  float64
 6   Team     271116 non-null  object 
 7   NOC      271116 non-null  object 
 8   Region   271102 non-null  object 
 9   Games    271116 non-null  object 
 10  Year     271116 non-null  int64  
 11  Season   271116 non-null  object 
 12  Sport    271116 non-null  object 
 13  Event    271116 non-null  object 
 14  City     271116 non-null  object 
 15  Country  271116 non-null  object 
 16  Medal    39783 non-null   object 
 17  Notes    5039 non-null    object 
dtypes: float64(3), int64(2), object(13)
memory usage: 37.2+ MB
In [8]:
# fix column data types 
olympics_games['Age'] = olympics_games['Age'].astype(pd.Int64Dtype())
olympics_games['Height'] = olympics_games['Height'].astype(pd.Int64Dtype())
olympics_games['Weight'] = olympics_games['Weight'].astype(pd.Int64Dtype())
olympics_games['Name'] = olympics_games['Name'].astype(pd.StringDtype())
olympics_games['Sex'] = olympics_games['Sex'].astype(pd.StringDtype())
olympics_games['Team'] = olympics_games['Team'].astype(pd.StringDtype())
olympics_games['NOC'] = olympics_games['NOC'].astype(pd.StringDtype())
olympics_games['Region'] = olympics_games['Region'].astype(pd.StringDtype())
olympics_games['Games'] = olympics_games['Games'].astype(pd.StringDtype())
olympics_games['Season'] = olympics_games['Season'].astype(pd.StringDtype())
olympics_games['Sport'] = olympics_games['Sport'].astype(pd.StringDtype())
olympics_games['Event'] = olympics_games['Event'].astype(pd.StringDtype())
olympics_games['City'] = olympics_games['City'].astype(pd.StringDtype())
olympics_games['Country'] = olympics_games['Country'].astype(pd.StringDtype())
olympics_games['Medal'] = olympics_games['Medal'].astype(pd.StringDtype())
olympics_games['Notes'] = olympics_games['Notes'].astype(pd.StringDtype())
In [9]:
olympics_games.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271116 entries, 0 to 271115
Data columns (total 18 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   ID       271116 non-null  int64 
 1   Name     271116 non-null  string
 2   Sex      271116 non-null  string
 3   Age      261642 non-null  Int64 
 4   Height   210945 non-null  Int64 
 5   Weight   207183 non-null  Int64 
 6   Team     271116 non-null  string
 7   NOC      271116 non-null  string
 8   Region   271102 non-null  string
 9   Games    271116 non-null  string
 10  Year     271116 non-null  int64 
 11  Season   271116 non-null  string
 12  Sport    271116 non-null  string
 13  Event    271116 non-null  string
 14  City     271116 non-null  string
 15  Country  271116 non-null  string
 16  Medal    39783 non-null   string
 17  Notes    5039 non-null    string
dtypes: Int64(3), int64(2), string(13)
memory usage: 38.0 MB
In [10]:
olympics_games
Out[10]:
ID Name Sex Age Height Weight Team NOC Region Games Year Season Sport Event City Country Medal Notes
0 1 A Dijiang M 24 180 80 China CHN China 1992 Summer 1992 Summer Basketball Basketball Men's Basketball Barcelona Spain <NA> <NA>
1 2 A Lamusi M 23 170 60 China CHN China 2012 Summer 2012 Summer Judo Judo Men's Extra-Lightweight London United Kingdom <NA> <NA>
2 3 Gunnar Nielsen Aaby M 24 <NA> <NA> Denmark DEN Denmark 1920 Summer 1920 Summer Football Football Men's Football Antwerpen Belgium <NA> <NA>
3 4 Edgar Lindenau Aabye M 34 <NA> <NA> Denmark/Sweden DEN Denmark 1900 Summer 1900 Summer Tug-Of-War Tug-Of-War Men's Tug-Of-War Paris France Gold <NA>
4 5 Christine Jacoba Aaftink F 21 185 82 Netherlands NED Netherlands 1988 Winter 1988 Winter Speed Skating Speed Skating Women's 500 metres Calgary Canada <NA> <NA>
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
271111 135569 Andrzej ya M 29 179 89 Poland-1 POL Poland 1976 Winter 1976 Winter Luge Luge Mixed (Men)'s Doubles Innsbruck Austria <NA> <NA>
271112 135570 Piotr ya M 27 176 59 Poland POL Poland 2014 Winter 2014 Winter Ski Jumping Ski Jumping Men's Large Hill, Individual Sochi Russia <NA> <NA>
271113 135570 Piotr ya M 27 176 59 Poland POL Poland 2014 Winter 2014 Winter Ski Jumping Ski Jumping Men's Large Hill, Team Sochi Russia <NA> <NA>
271114 135571 Tomasz Ireneusz ya M 30 185 96 Poland POL Poland 1998 Winter 1998 Winter Bobsleigh Bobsleigh Men's Four Nagano Japan <NA> <NA>
271115 135571 Tomasz Ireneusz ya M 34 185 96 Poland POL Poland 2002 Winter 2002 Winter Bobsleigh Bobsleigh Men's Four Salt Lake City United States <NA> <NA>

271116 rows × 18 columns

In [11]:
# data without duplicated rows
olympics_games_distinct = (pysqldf(""" SELECT DISTINCT
                            ID, Name, Sex, Age, Height, Weight, Team, NOC, Region, Games, Year, Season, Sport, Event, City, Country, Medal, Notes
                    FROM olympics_games """))
olympics_games_distinct
Out[11]:
ID Name Sex Age Height Weight Team NOC Region Games Year Season Sport Event City Country Medal Notes
0 1 A Dijiang M 24.0 180.0 80.0 China CHN China 1992 Summer 1992 Summer Basketball Basketball Men's Basketball Barcelona Spain None None
1 2 A Lamusi M 23.0 170.0 60.0 China CHN China 2012 Summer 2012 Summer Judo Judo Men's Extra-Lightweight London United Kingdom None None
2 3 Gunnar Nielsen Aaby M 24.0 NaN NaN Denmark DEN Denmark 1920 Summer 1920 Summer Football Football Men's Football Antwerpen Belgium None None
3 4 Edgar Lindenau Aabye M 34.0 NaN NaN Denmark/Sweden DEN Denmark 1900 Summer 1900 Summer Tug-Of-War Tug-Of-War Men's Tug-Of-War Paris France Gold None
4 5 Christine Jacoba Aaftink F 21.0 185.0 82.0 Netherlands NED Netherlands 1988 Winter 1988 Winter Speed Skating Speed Skating Women's 500 metres Calgary Canada None None
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
269726 135569 Andrzej ya M 29.0 179.0 89.0 Poland-1 POL Poland 1976 Winter 1976 Winter Luge Luge Mixed (Men)'s Doubles Innsbruck Austria None None
269727 135570 Piotr ya M 27.0 176.0 59.0 Poland POL Poland 2014 Winter 2014 Winter Ski Jumping Ski Jumping Men's Large Hill, Individual Sochi Russia None None
269728 135570 Piotr ya M 27.0 176.0 59.0 Poland POL Poland 2014 Winter 2014 Winter Ski Jumping Ski Jumping Men's Large Hill, Team Sochi Russia None None
269729 135571 Tomasz Ireneusz ya M 30.0 185.0 96.0 Poland POL Poland 1998 Winter 1998 Winter Bobsleigh Bobsleigh Men's Four Nagano Japan None None
269730 135571 Tomasz Ireneusz ya M 34.0 185.0 96.0 Poland POL Poland 2002 Winter 2002 Winter Bobsleigh Bobsleigh Men's Four Salt Lake City United States None None

269731 rows × 18 columns

In [12]:
# Determine duplicate rows in original data 
display(pysqldf(""" SELECT DISTINCT
                            ID, Name, Sex, Age, Height, Weight, Team, NOC, Region, Games, Year, Season, Sport, Event, City, Country, Medal, Notes
                    FROM olympics_games
                    GROUP BY ID, Name, Sex, Age, Height, Weight, Team, NOC, Region, Games, Year, Season, Sport, Event, City, Country, Medal, Notes
                    HAVING COUNT(ID) > 1"""))
ID Name Sex Age Height Weight Team NOC Region Games Year Season Sport Event City Country Medal Notes
0 704 Dsir Antoine Acket M 27.0 NaN NaN Belgium BEL Belgium 1932 Summer 1932 Summer Art Competitions Art Competitions Mixed Painting, Unknown Event Los Angeles United States None None
1 2449 William Truman Aldrich M 48.0 NaN NaN United States USA United States 1928 Summer 1928 Summer Art Competitions Art Competitions Mixed Painting, Drawings And Water Colors Amsterdam Netherlands None None
2 2777 Hermann Reinhard Alker M 43.0 NaN NaN Germany GER Germany 1928 Summer 1928 Summer Art Competitions Art Competitions Mixed Architecture, Architectural Designs Amsterdam Netherlands None None
3 2777 Hermann Reinhard Alker M 43.0 NaN NaN Germany GER Germany 1928 Summer 1928 Summer Art Competitions Art Competitions Mixed Architecture, Designs For Town Planning Amsterdam Netherlands None None
4 2777 Hermann Reinhard Alker M 51.0 NaN NaN Germany GER Germany 1936 Summer 1936 Summer Art Competitions Art Competitions Mixed Architecture, Unknown Event Berlin Germany None None
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
607 134046 ngel Zrraga Argelles M 41.0 NaN NaN Mexico MEX Mexico 1928 Summer 1928 Summer Art Competitions Art Competitions Mixed Painting, Paintings Amsterdam Netherlands None None
608 135072 Anna Katrina Zinkeisen (-Heseltine) F 46.0 NaN NaN Great Britain GBR United Kingdom 1948 Summer 1948 Summer Art Competitions Art Competitions Mixed Painting, Paintings London United Kingdom None None
609 135072 Anna Katrina Zinkeisen (-Heseltine) F 46.0 NaN NaN Great Britain GBR United Kingdom 1948 Summer 1948 Summer Art Competitions Art Competitions Mixed Painting, Unknown Event London United Kingdom None None
610 135073 Doris Clare Zinkeisen (-Johnstone) F 49.0 NaN NaN Great Britain GBR United Kingdom 1948 Summer 1948 Summer Art Competitions Art Competitions Mixed Painting, Unknown Event London United Kingdom None None
611 135173 Henri Achille Zo M 58.0 NaN NaN France FRA France 1932 Summer 1932 Summer Art Competitions Art Competitions Mixed Painting, Unknown Event Los Angeles United States None None

612 rows × 18 columns

In [13]:
display(pysqldf(""" SELECT * 
                    FROM olympics_games
                    WHERE ID = 2449"""))
ID Name Sex Age Height Weight Team NOC Region Games Year Season Sport Event City Country Medal Notes
0 2449 William Truman Aldrich M 48 None None United States USA United States 1928 Summer 1928 Summer Art Competitions Art Competitions Mixed Painting, Drawings And Water Colors Amsterdam Netherlands None None
1 2449 William Truman Aldrich M 48 None None United States USA United States 1928 Summer 1928 Summer Art Competitions Art Competitions Mixed Painting, Drawings And Water Colors Amsterdam Netherlands None None
2 2449 William Truman Aldrich M 48 None None United States USA United States 1928 Summer 1928 Summer Art Competitions Art Competitions Mixed Painting, Drawings And Water Colors Amsterdam Netherlands None None

Descriptive Statistics¶

Using SQL¶

Count¶

In [14]:
# total number of observations in the data
display(pysqldf("SELECT COUNT(*)  FROM olympics_games; "))
COUNT(*)
0 271116
In [15]:
# total number of observations in the data without duplicated rows 
display(pysqldf("SELECT COUNT(*)  FROM olympics_games_distinct; "))
COUNT(*)
0 269731
In [16]:
# count null values in the data
display(pysqldf("""SELECT 
                        COUNT(*)-COUNT(ID) As ID, 
                        COUNT(*)-COUNT(Name) As Name, 
                        COUNT(*)-COUNT(Sex) As Sex, 
                        COUNT(*)-COUNT(Age) As Age, 
                        COUNT(*)-COUNT(Height) As Height, 
                        COUNT(*)-COUNT(Weight) As Weight, 
                        COUNT(*)-COUNT(Team) As Team,
                        COUNT(*)-COUNT(NOC) As NOC,
                        COUNT(*)-COUNT(Region) As Region,
                        COUNT(*)-COUNT(Games) As Games,
                        COUNT(*)-COUNT(Year) As Year,
                        COUNT(*)-COUNT(Season) As Season,
                        COUNT(*)-COUNT(Sport) As Sport,
                        COUNT(*)-COUNT(Event) As Event,
                        COUNT(*)-COUNT(City) As City,
                        COUNT(*)-COUNT(Country) As Country,
                        COUNT(*)-COUNT(Medal) As Medal,
                        COUNT(*)-COUNT(Notes) As Notes
                    FROM olympics_games_distinct;  """))
ID Name Sex Age Height Weight Team NOC Region Games Year Season Sport Event City Country Medal Notes
0 0 0 0 9315 58814 62585 0 0 14 0 0 0 0 0 0 0 229959 264696
In [17]:
# null regions (Refugee and Unknown teams)
display(pysqldf("SELECT * FROM olympics_games_distinct WHERE Region IS NULL; "))
ID Name Sex Age Height Weight Team NOC Region Games Year Season Sport Event City Country Medal Notes
0 3515 Paulo Amotun Lokoro M 24.0 170.0 61.0 Refugee Olympic Athletes ROT None 2016 Summer 2016 Summer Athletics Athletics Men's 1,500 metres Rio de Janeiro Brazil None Refugee Olympic Team
1 4379 Rami Anis M 25.0 178.0 78.0 Refugee Olympic Athletes ROT None 2016 Summer 2016 Summer Swimming Swimming Men's 100 metres Freestyle Rio de Janeiro Brazil None Refugee Olympic Team
2 4379 Rami Anis M 25.0 178.0 78.0 Refugee Olympic Athletes ROT None 2016 Summer 2016 Summer Swimming Swimming Men's 100 metres Butterfly Rio de Janeiro Brazil None Refugee Olympic Team
3 11364 Yiech Pur Biel M 21.0 178.0 62.0 Refugee Olympic Athletes ROT None 2016 Summer 2016 Summer Athletics Athletics Men's 800 metres Rio de Janeiro Brazil None Refugee Olympic Team
4 16287 Mabika Yolande Bukasa F 28.0 170.0 70.0 Refugee Olympic Athletes ROT None 2016 Summer 2016 Summer Judo Judo Women's Middleweight Rio de Janeiro Brazil None Refugee Olympic Team
5 20693 James Nyang Chiengjiek M 24.0 179.0 59.0 Refugee Olympic Athletes ROT None 2016 Summer 2016 Summer Athletics Athletics Men's 400 metres Rio de Janeiro Brazil None Refugee Olympic Team
6 31292 Fritz Eccard M NaN NaN NaN Unknown UNK None 1912 Summer 1912 Summer Art Competitions Art Competitions Mixed Architecture Stockholm Sweden None Unknown
7 80325 Popole Misenga M 24.0 180.0 90.0 Refugee Olympic Athletes ROT None 2016 Summer 2016 Summer Judo Judo Men's Middleweight Rio de Janeiro Brazil None Refugee Olympic Team
8 60444 Yonas Kinde M 36.0 172.0 57.0 Refugee Olympic Athletes ROT None 2016 Summer 2016 Summer Athletics Athletics Men's Marathon Rio de Janeiro Brazil None Refugee Olympic Team
9 65813 A. Laffen M NaN NaN NaN Unknown UNK None 1912 Summer 1912 Summer Art Competitions Art Competitions Mixed Architecture Stockholm Sweden None Unknown
10 71067 Anjelina Nadai Lohalith F 23.0 163.0 50.0 Refugee Olympic Athletes ROT None 2016 Summer 2016 Summer Athletics Athletics Women's 1,500 metres Rio de Janeiro Brazil None Refugee Olympic Team
11 71113 Rose Nathike Lokonyen F 21.0 157.0 50.0 Refugee Olympic Athletes ROT None 2016 Summer 2016 Summer Athletics Athletics Women's 800 metres Rio de Janeiro Brazil None Refugee Olympic Team
12 74902 Yusra Mardini F 18.0 157.0 53.0 Refugee Olympic Athletes ROT None 2016 Summer 2016 Summer Swimming Swimming Women's 100 metres Freestyle Rio de Janeiro Brazil None Refugee Olympic Team
13 74902 Yusra Mardini F 18.0 157.0 53.0 Refugee Olympic Athletes ROT None 2016 Summer 2016 Summer Swimming Swimming Women's 100 metres Butterfly Rio de Janeiro Brazil None Refugee Olympic Team
In [18]:
# count number of distinct values in the data
display(pysqldf(""" SELECT
                        COUNT(DISTINCT ID) AS IDs, 
                        COUNT(DISTINCT Name) AS Names,
                        COUNT(DISTINCT Sex) AS Sex, 
                        COUNT(DISTINCT Team) AS Teams, 
                        COUNT(DISTINCT NOC) AS NOCs, 
                        COUNT(DISTINCT Region) AS Regions, 
                        COUNT(DISTINCT Sport) AS Sports, 
                        COUNT(DISTINCT Event) AS Events, 
                        COUNT(DISTINCT Games) AS Games, 
                        COUNT(DISTINCT City) AS Cities, 
                        COUNT(DISTINCT Country) AS Countries, 
                        COUNT(DISTINCT Year) AS Years, 
                        COUNT(DISTINCT Season) AS Season,
                        COUNT(DISTINCT Medal) AS Medals
                    FROM olympics_games_distinct; """))
IDs Names Sex Teams NOCs Regions Sports Events Games Cities Countries Years Season Medals
0 135571 134732 2 1184 230 207 66 765 51 42 23 35 2 3
In [19]:
%%time
# Total number of medals per gender  
display(pysqldf(""" SELECT
                        Sex, 
                        COUNT(Medal) AS total_medal
                    FROM olympics_games_distinct
                    GROUP BY Sex; """))
Sex total_medal
0 F 11253
1 M 28519
CPU times: total: 4.39 s
Wall time: 4.39 s
In [20]:
%%time
# Total number of medals per gender using window funtion 
display(pysqldf(""" SELECT
                        DISTINCT Sex, 
                        COUNT(Medal) OVER(PARTITION BY Sex) AS total_medal
                    FROM olympics_games_distinct; """))
Sex total_medal
0 F 11253
1 M 28519
CPU times: total: 4.88 s
Wall time: 4.89 s
In [21]:
%%time
# Total number of gold, silver, bronze medals per sex
display(pysqldf(""" SELECT
                        Sex, 
                        Medal, 
                        COUNT(Medal) AS total_medal
                    FROM olympics_games_distinct
                    WHERE Medal IS NOT NULL
                    GROUP BY Sex, Medal; """))
Sex Medal total_medal
0 F Bronze 3771
1 F Gold 3747
2 F Silver 3735
3 M Bronze 9524
4 M Gold 9622
5 M Silver 9373
CPU times: total: 4.39 s
Wall time: 4.38 s
In [22]:
%%time
# Total number of gold, silver, bronze medals per sex using window function 
display(pysqldf(""" SELECT
                        DISTINCT Sex, MedaL, 
                        COUNT(Medal) OVER(PARTITION BY Sex, Medal)AS total_medal
                    FROM olympics_games_distinct
                    WHERE Medal IS NOT NULL; """))
Sex Medal total_medal
0 F Bronze 3771
1 F Gold 3747
2 F Silver 3735
3 M Bronze 9524
4 M Gold 9622
5 M Silver 9373
CPU times: total: 4.69 s
Wall time: 4.69 s
In [23]:
# count how many distinct IDs each name belongs to
display(pysqldf(""" SELECT Name, COUNT(DISTINCT ID) AS id_count 
                    FROM olympics_games_distinct
                    GROUP BY Name
                    ORDER BY id_count DESC  """))
Name id_count
0 Zhang Li 5
1 Wolfgang Mller 5
2 Wang Nan 5
3 Lszl Szab 5
4 Li Jie 5
... ... ...
134727 Th Anh 1
134728 Luis ngel Fernando de los Santos Grossi 1
134729 Jean Hauptmanns 1
134730 Eleonora Margarida Josephina Scmitt 1
134731 Gabrielle Marie "Gabby" Adcock (White-) 1

134732 rows × 2 columns

In [24]:
%%time
#Gender distribution in sports
gender_per_sport = pysqldf(""" SELECT 
                        Sport, Sex, COUNT(DISTINCT ID) as gender_count
                    FROM olympics_games_distinct
                    GROUP BY Sport, Sex
                    /*ORDER BY gender_count DESC*/""")

gender_per_sport
CPU times: total: 4.84 s
Wall time: 4.85 s
Out[24]:
Sport Sex gender_count
0 Aeronautics M 1
1 Alpine Skiing F 996
2 Alpine Skiing M 1739
3 Alpinism F 1
4 Alpinism M 24
... ... ... ...
111 Water Polo M 2262
112 Weightlifting F 356
113 Weightlifting M 2526
114 Wrestling F 222
115 Wrestling M 4766

116 rows × 3 columns

In [25]:
#Gender distribution in sports
gender_per_sport = pysqldf(""" SELECT  
                                      Sport, 
                                      COUNT(CASE WHEN Sex = 'M' THEN 1 END) AS male_count, 
                                      COUNT(CASE WHEN Sex = 'F' THEN 1 END) AS female_count
                                FROM(
                                        SELECT DISTINCT ID, Name, Sex, Sport 
                                        FROM olympics_games_distinct
                                        )
                                GROUP BY Sport  """)

gender_per_sport
Out[25]:
Sport male_count female_count
0 Aeronautics 1 0
1 Alpine Skiing 1739 996
2 Alpinism 24 1
3 Archery 613 500
4 Art Competitions 1610 204
5 Athletics 15542 6529
6 Badminton 399 412
7 Baseball 761 0
8 Basketball 2481 932
9 Basque Pelota 2 0
10 Beach Volleyball 194 189
11 Biathlon 764 371
12 Bobsleigh 1585 109
13 Boxing 5197 65
14 Canoeing 2504 702
15 Cricket 24 0
16 Croquet 7 3
17 Cross Country Skiing 1683 717
18 Curling 186 160
19 Cycling 5105 714
20 Diving 831 635
21 Equestrianism 1886 459
22 Fencing 3243 880
23 Figure Skating 748 824
24 Football 5427 734
25 Freestyle Skiing 359 267
26 Golf 148 70
27 Gymnastics 2635 1499
28 Handball 1675 1027
29 Hockey 2829 996
30 Ice Hockey 3386 498
31 Jeu De Paume 11 0
32 Judo 1967 757
33 Lacrosse 60 0
34 Luge 544 228
35 Military Ski Patrol 24 0
36 Modern Pentathlon 750 114
37 Motorboating 13 1
38 Nordic Combined 605 0
39 Polo 87 0
40 Racquets 7 0
41 Rhythmic Gymnastics 0 567
42 Roque 4 0
43 Rowing 6204 1483
44 Rugby 155 0
45 Rugby Sevens 151 148
46 Sailing 3851 629
47 Shooting 4145 737
48 Short Track Speed Skating 235 209
49 Skeleton 101 45
50 Ski Jumping 844 30
51 Snowboarding 328 239
52 Softball 0 367
53 Speed Skating 1054 528
54 Swimming 5144 3621
55 Synchronized Swimming 0 550
56 Table Tennis 372 377
57 Taekwondo 241 229
58 Tennis 760 486
59 Trampolining 49 44
60 Triathlon 180 175
61 Tug-Of-War 160 0
62 Volleyball 1374 1129
63 Water Polo 2262 337
64 Weightlifting 2526 356
65 Wrestling 4766 222
Home vs. Away teams medals¶
In [26]:
#Count number of home and away teams' medals in each game 
medals_count_df = pysqldf(""" 
                    SELECT Games, 
                           TeamType, 
                           COUNT(Medal) AS medal_count
                    FROM (  SELECT DISTINCT ID, 
                                    Name, 
                                    Games,
                                    Event,
                                    Sport, 
                                    Team, 
                                    Region, 
                                    Country, 
                                    Medal,
                                    (CASE WHEN Region != Country THEN 'away' 
                                          WHEN Region = Country THEN 'home'
                                          END) AS TeamType
                            FROM olympics_games_distinct
                            WHERE Medal IS NOT NULL)
                    GROUP BY Games, TeamType
                """)
medals_count_df
Out[26]:
Games TeamType medal_count
0 1896 Summer away 95
1 1896 Summer home 48
2 1900 Summer away 368
3 1900 Summer home 225
4 1904 Summer away 92
... ... ... ...
97 2012 Summer home 126
98 2014 Winter away 529
99 2014 Winter home 68
100 2016 Summer away 1973
101 2016 Summer home 50

102 rows × 3 columns

In [27]:
# number of teams in every game 
display(pysqldf(""" SELECT Games, COUNT(DISTINCT Team) as num_teams 
                    FROM olympics_games_distinct
                    GROUP BY Games 
                    ;"""))
Games num_teams
0 1896 Summer 18
1 1900 Summer 190
2 1904 Summer 79
3 1906 Summer 52
4 1908 Summer 73
5 1912 Summer 102
6 1920 Summer 72
7 1924 Summer 90
8 1924 Winter 28
9 1928 Summer 67
10 1928 Winter 41
11 1932 Summer 59
12 1932 Winter 29
13 1936 Summer 105
14 1936 Winter 54
15 1948 Summer 111
16 1948 Winter 46
17 1952 Summer 133
18 1952 Winter 52
19 1956 Summer 122
20 1956 Winter 56
21 1960 Summer 186
22 1960 Winter 40
23 1964 Summer 168
24 1964 Winter 66
25 1968 Summer 112
26 1968 Winter 70
27 1972 Summer 139
28 1972 Winter 63
29 1976 Summer 92
30 1976 Winter 70
31 1980 Summer 80
32 1980 Winter 58
33 1984 Summer 140
34 1984 Winter 80
35 1988 Summer 177
36 1988 Winter 98
37 1992 Summer 212
38 1992 Winter 111
39 1994 Winter 101
40 1996 Summer 246
41 1998 Winter 106
42 2000 Summer 243
43 2002 Winter 114
44 2004 Summer 260
45 2006 Winter 113
46 2008 Summer 292
47 2010 Winter 116
48 2012 Summer 245
49 2014 Winter 119
50 2016 Summer 249
In [28]:
# number of home and away teams in every game
homeaway_teamcnt_df = pysqldf("""  WITH hometeams AS( SELECT Games, COUNT(DISTINCT Team) AS home_team_cnt 
                                                      FROM olympics_games_distinct
                                                      WHERE Region=Country
                                                      GROUP BY Games)
                                    SELECT *
                                    FROM hometeams
                                    LEFT JOIN( SELECT Games, COUNT(DISTINCT Team) AS away_team_cnt 
                                               FROM olympics_games_distinct
                                               WHERE Region!=Country
                                               GROUP BY Games)
                                    USING(Games)
                                """)
homeaway_teamcnt_df
Out[28]:
Games home_team_cnt away_team_cnt
0 1896 Summer 5 13
1 1900 Summer 122 77
2 1904 Summer 59 27
3 1906 Summer 21 33
4 1908 Summer 34 39
5 1912 Summer 17 84
6 1920 Summer 9 63
7 1924 Summer 5 85
8 1924 Winter 3 25
9 1928 Summer 3 64
10 1928 Winter 3 38
11 1932 Summer 4 55
12 1932 Winter 3 26
13 1936 Summer 5 100
14 1936 Winter 3 51
15 1948 Summer 5 106
16 1948 Winter 3 43
17 1952 Summer 5 128
18 1952 Winter 3 49
19 1956 Summer 6 118
20 1956 Winter 3 53
21 1960 Summer 5 181
22 1960 Winter 4 36
23 1964 Summer 5 163
24 1964 Winter 3 63
25 1968 Summer 1 111
26 1968 Winter 3 67
27 1972 Summer 8 131
28 1972 Winter 3 60
29 1976 Summer 1 91
30 1976 Winter 3 67
31 1980 Summer 1 79
32 1980 Winter 3 55
33 1984 Summer 1 139
34 1984 Winter 3 77
35 1988 Summer 3 174
36 1988 Winter 4 94
37 1992 Summer 1 211
38 1992 Winter 4 107
39 1994 Winter 1 100
40 1996 Summer 4 242
41 1998 Winter 3 103
42 2000 Summer 4 239
43 2002 Winter 3 111
44 2004 Summer 3 257
45 2006 Winter 3 110
46 2008 Summer 5 287
47 2010 Winter 3 113
48 2012 Summer 3 242
49 2014 Winter 4 115
50 2016 Summer 3 245
In [29]:
# only 1 home team in 1980 summer?
display(pysqldf(""" SELECT DISTINCT Team
                    FROM olympics_games_distinct
                    WHERE Games = '1980 Summer'  AND (Region = Country) """))
Team
0 Soviet Union
In [30]:
# home teams in 1896 Summer 
display(pysqldf(""" SELECT DISTINCT Team
                    FROM olympics_games_distinct
                    WHERE Games = '1896 Summer'  AND (Region = Country)
                   """))
Team
0 Greece
1 Greece-3
2 Greece-1
3 Greece-2
4 Ethnikos Gymnastikos Syllogos
In [31]:
# verify results above 
display(pysqldf(""" SELECT DISTINCT Games, Team, Region, Country
                    FROM olympics_games_distinct
                    WHERE Games = '1904 Summer' AND Region = Country ; """))
Games Team Region Country
0 1904 Summer Vesper Boat Club United States United States
1 1904 Summer United States United States United States
2 1904 Summer New York Athletic Club #2-4 United States United States
3 1904 Summer Mound City Rowing Club-2 United States United States
4 1904 Summer La Salle Turnverein, Chicago United States United States
5 1904 Summer Christian Brothers' College-1 United States United States
6 1904 Summer Passaic Turnverein, Passaic United States United States
7 1904 Summer Chicago Athletic Association-2 United States United States
8 1904 Summer United States-2 United States United States
9 1904 Summer Davenport Turngemeinde, Davenport United States United States
10 1904 Summer Norwegier Turnverein, Brooklyn United States United States
11 1904 Summer Turnverein Vorwrts, Cleveland United States United States
12 1904 Summer New York Turnverein, New York United States United States
13 1904 Summer United States-10 United States United States
14 1904 Summer St. Rose-2 United States United States
15 1904 Summer New York Athletic Club-1 United States United States
16 1904 Summer St. Louis Southwest Turnverein #1-2 United States United States
17 1904 Summer Chicago Archers United States United States
18 1904 Summer Boston Archers United States United States
19 1904 Summer Western Rowing Club-3 United States United States
20 1904 Summer United States Golf Association-3 United States United States
21 1904 Summer Trans-Mississippi Golf Association-2 United States United States
22 1904 Summer New York Athletic Club-4 United States United States
23 1904 Summer United States-9 United States United States
24 1904 Summer Cincinnati Archers United States United States
25 1904 Summer United States-6 United States United States
26 1904 Summer Western Golf Association-1 United States United States
27 1904 Summer New York Athletic Club #1-1 United States United States
28 1904 Summer United States-7 United States United States
29 1904 Summer Concordia Turnverein, St Louis United States United States
30 1904 Summer Century Boat Club-1 United States United States
31 1904 Summer St. Louis Amateur Athletic Association United States United States
32 1904 Summer United States-14 United States United States
33 1904 Summer Central Turnverein, Chicago United States United States
34 1904 Summer Turnverein Vorwrts, Chicago United States United States
35 1904 Summer Missouri Athletic Club-3 United States United States
36 1904 Summer Seawanhaka Boat Club-1 United States United States
37 1904 Summer Milwaukee Athletic Club-1 United States United States
38 1904 Summer St. Louis Southwest Turnverein #2-3 United States United States
39 1904 Summer Socialer Turnverein, Detroit United States United States
40 1904 Summer Potomac Archers United States United States
41 1904 Summer Ravenswood Boat Club-2 United States United States
42 1904 Summer United States-11 United States United States
43 1904 Summer Atalanta Boat Club-1 United States United States
44 1904 Summer Atalanta Boat Club-2 United States United States
45 1904 Summer New York Athletic Club United States United States
46 1904 Summer South St Louis Turnverein, St Louis United States United States
47 1904 Summer Milwaukee Turnverein, Milwaukee United States United States
48 1904 Summer Independent Rowing Club-3 United States United States
49 1904 Summer Philadelphia Turngemeinde, Philadelphia United States United States
50 1904 Summer United States-13 United States United States
51 1904 Summer United States-12 United States United States
52 1904 Summer United States-4 United States United States
53 1904 Summer Germany/United States United States United States
54 1904 Summer Chicago Athletic Association United States United States
55 1904 Summer United States-5 United States United States
56 1904 Summer United States-8 United States United States
57 1904 Summer United States-3 United States United States
58 1904 Summer United States-1 United States United States
In [32]:
#check why there are no home teams in 1984 winter (Yugoslavia is now Serbia) (yusgoslavia country changed to serbia in databricks workspace)
#display(pysqldf(""" SELECT DISTINCT Games, Team, Region, Country
    #                FROM olympics_games_distinct
   #                 WHERE Games = '1984 Winter' /*AND Region = 'UK'*/; """))
In [33]:
#Count number of home and away teams' medals in each game 
medals_count_df = pysqldf(""" 
                    SELECT Games, 
                           AwayHome, 
                           COUNT(Medal) AS medal_count 
                    FROM (  SELECT DISTINCT ID, 
                                    Name, 
                                    Games,
                                    Event,
                                    Sport, 
                                    Team, 
                                    Region, 
                                    Country, 
                                    Medal,
                                    (CASE WHEN Region != Country THEN 'away' 
                                          WHEN Region = Country THEN 'home'
                                          END) AS AwayHome
                            FROM olympics_games_distinct )
                    GROUP BY Games, AwayHome
                """)
medals_count_df
Out[33]:
Games AwayHome medal_count
0 1896 Summer away 95
1 1896 Summer home 48
2 1900 Summer away 368
3 1900 Summer home 225
4 1904 Summer away 92
... ... ... ...
99 2014 Winter away 529
100 2014 Winter home 68
101 2016 Summer None 0
102 2016 Summer away 1973
103 2016 Summer home 50

104 rows × 3 columns

In [34]:
# Count total number of away teams' medals and home teams medals
display(pysqldf(""" SELECT AwayHome, 
                           SUM(medal_count) as total_medal_count  
                    FROM medals_count_df
                    GROUP BY AwayHome """))
AwayHome total_medal_count
0 None 0
1 away 34756
2 home 5016
In [35]:
home_medal_df = pysqldf(""" SELECT Games, medal_count AS home_medal_count 
                            FROM medals_count_df 
                            WHERE AwayHome = 'home'
                            """)
away_medal_df = pysqldf(""" SELECT Games, medal_count AS away_medal_count 
                            FROM medals_count_df 
                            WHERE AwayHome = 'away'
                            """)
In [36]:
# put home and away team medal count into separate columns 
homeaway_df = pysqldf(""" SELECT * 
                          FROM home_medal_df 
                          LEFT JOIN away_medal_df
                          USING(Games)
                            """)
In [37]:
# join total number of home/away medals and teams 
homeaway_teammedal_df = pysqldf("""   SELECT * 
                                      FROM homeaway_df 
                                      LEFT JOIN homeaway_teamcnt_df
                                      USING(Games) """)
homeaway_teammedal_df
Out[37]:
Games home_medal_count away_medal_count home_team_cnt away_team_cnt
0 1896 Summer 48 95 5 13
1 1900 Summer 225 368 122 77
2 1904 Summer 394 92 59 27
3 1906 Summer 102 356 21 33
4 1908 Summer 368 463 34 39
5 1912 Summer 190 751 17 84
6 1920 Summer 188 1120 9 63
7 1924 Summer 110 722 5 85
8 1924 Winter 10 120 3 25
9 1928 Summer 57 677 3 64
10 1928 Winter 12 77 3 38
11 1932 Summer 189 458 4 55
12 1932 Winter 34 58 3 26
13 1936 Summer 224 693 5 100
14 1936 Winter 7 101 3 51
15 1948 Summer 61 791 5 106
16 1948 Winter 28 107 3 43
17 1952 Summer 40 857 5 128
18 1952 Winter 19 117 3 49
19 1956 Summer 72 821 6 118
20 1956 Winter 8 142 3 53
21 1960 Summer 88 823 5 181
22 1960 Winter 27 120 4 36
23 1964 Summer 62 967 5 163
24 1964 Winter 17 169 3 63
25 1968 Summer 9 1048 1 111
26 1968 Winter 9 190 3 67
27 1972 Summer 253 962 8 131
28 1972 Winter 3 196 3 60
29 1976 Summer 23 1297 1 91
30 1976 Winter 7 204 3 67
31 1980 Summer 442 942 1 79
32 1980 Winter 30 188 3 55
33 1984 Summer 352 1124 1 139
34 1984 Winter 1 221 3 77
35 1988 Summer 77 1505 3 174
36 1988 Winter 6 257 4 94
37 1992 Summer 69 1643 1 211
38 1992 Winter 12 306 4 107
39 1994 Winter 30 301 1 100
40 1996 Summer 259 1583 4 242
41 1998 Winter 13 427 3 103
42 2000 Summer 183 1821 4 239
43 2002 Winter 84 394 3 111
44 2004 Summer 31 1970 3 257
45 2006 Winter 25 501 3 110
46 2008 Summer 184 1864 5 287
47 2010 Winter 90 430 3 113
48 2012 Summer 126 1815 3 242
49 2014 Winter 68 529 4 115
50 2016 Summer 50 1973 3 245

Mean, Median, Mode, Min, Max, IQR¶

In [38]:
# function to compute descriptive stats 
def descriptive_stats(table_name, table_col): 
    print("**************************************************")
    print(f"Table Name: {table_name}")
    print(f"Table Column: {table_col}")
    print("**************************************************")

    print("")

    # Count
    cnt = pysqldf(f"""SELECT 
                            "COUNT" AS Stat, 
                            COUNT({table_col}) AS Value 
                      FROM {table_name}; """)

    # Mean 
    mean = pysqldf(f"""SELECT 
                            "MEAN" AS Stat, 
                            CAST(AVG({table_col}) AS INT) AS Value 
                        FROM {table_name};""")
    # Median 
    median = pysqldf(f"""SELECT 
                            "MEDIAN" AS Stat,
                            {table_col} AS Value 
                        FROM {table_name}
                        ORDER BY {table_col}
                        LIMIT 1
                        OFFSET (SELECT COUNT(*) FROM {table_name}) / 2 ;""")
    # Mode 
    mode = pysqldf(f"""SELECT 
                            "MODE" AS Stat,
                            {table_col} AS Value,
                            COUNT(*) AS Count
                        FROM {table_name}
                        WHERE {table_col} IS NOT NULL
                        GROUP BY {table_col}
                        ORDER BY COUNT(*) DESC
                        LIMIT 1""")
    # Min and Max 
    min = pysqldf(f"""SELECT 
                            "MIN" AS Stat, 
                            MIN({table_col}) AS Value 
                      FROM {table_name}; """)
    max = pysqldf(f"""SELECT 
                            "MAX" AS Stat, 
                            MAX({table_col}) AS Value 
                      FROM {table_name}; """)
    
    display(pd.concat([cnt, mean, median, min, max]).set_index("Stat"))
    display(mode.set_index('Stat'))
    print("")
In [39]:
descriptive_stats('olympics_games_distinct', 'olympics_games_distinct.Height')
**************************************************
Table Name: olympics_games_distinct
Table Column: olympics_games_distinct.Height
**************************************************

Value
Stat
COUNT 210917.0
MEAN 175.0
MEDIAN 171.0
MIN 127.0
MAX 226.0
Value Count
Stat
MODE 180.0 12492

In [40]:
descriptive_stats('olympics_games_distinct', 'olympics_games_distinct.Age')
**************************************************
Table Name: olympics_games_distinct
Table Column: olympics_games_distinct.Age
**************************************************

Value
Stat
COUNT 260416.0
MEAN 25.0
MEDIAN 24.0
MIN 10.0
MAX 97.0
Value Count
Stat
MODE 23.0 21863

In [41]:
descriptive_stats('olympics_games_distinct', 'olympics_games_distinct.Weight')
**************************************************
Table Name: olympics_games_distinct
Table Column: olympics_games_distinct.Weight
**************************************************

Value
Stat
COUNT 207146.0
MEAN 70.0
MEDIAN 64.0
MIN 25.0
MAX 214.0
Value Count
Stat
MODE 70.0 9625

In [42]:
# For 25-50-75-100 quartile range in SQL 

def get_iqrs(table_name, table_col):
    query = f"""WITH percent_tbl AS (SELECT 
                                            {table_col}, 
                                            ntile(4) OVER (ORDER BY {table_col}) percent
                                            FROM {table_name}), 
                     p_25 AS (SELECT 
                                    (percent * 0.25) as pct, 
                                    last_value({table_col}) OVER (PARTITION BY percent) last_val
                               FROM percent_tbl
                               WHERE percent =  1 
                               LIMIT 1), 
                     p_50 AS (SELECT 
                                    (percent * 0.25) as pct, 
                                    last_value({table_col}) OVER (PARTITION BY percent) last_val
                               FROM percent_tbl
                               WHERE percent =  2 
                               LIMIT 1),  
                     p_75 AS (SELECT 
                                    (percent * 0.25) as pct, 
                                    last_value({table_col}) OVER (PARTITION BY percent) last_val
                               FROM percent_tbl
                               WHERE percent =  3 
                               LIMIT 1), 
                     p_100 AS (SELECT 
                                    (percent * 0.25) as pct, 
                                    last_value({table_col}) OVER (PARTITION BY percent) last_val
                               FROM percent_tbl
                               WHERE percent =  4 
                               LIMIT 1)
                     SELECT * FROM p_25 UNION
                     SELECT * FROM p_50 UNION
                     SELECT * FROM p_75 UNION
                     SELECT * FROM p_100;"""
    iqrs = pysqldf(query)
    return display(pd.DataFrame(iqrs))
In [43]:
get_iqrs('olympics_games_distinct', 'Age')
pct last_val
0 0.25 21.0
1 0.50 24.0
2 0.75 28.0
3 1.00 97.0
In [44]:
get_iqrs('olympics_games_distinct', 'Height')
pct last_val
0 0.25 157.0
1 0.50 171.0
2 0.75 180.0
3 1.00 226.0
In [45]:
get_iqrs('olympics_games_distinct', 'Weight')
pct last_val
0 0.25 47.0
1 0.50 64.0
2 0.75 75.0
3 1.00 214.0
In [46]:
descriptive_stats('olympics_games_distinct', 'olympics_games_distinct.ID')
**************************************************
Table Name: olympics_games_distinct
Table Column: olympics_games_distinct.ID
**************************************************

Value
Stat
COUNT 269731
MEAN 68264
MEDIAN 68233
MIN 1
MAX 135571
Value Count
Stat
MODE 106296 39

In [47]:
# ID with the most number of observations in the data
display(pysqldf(""" SELECT ID, Name, COUNT(DISTINCT Games) as num_games 
                    FROM olympics_games_distinct 
                    WHERE ID = 106296; """))
ID Name num_games
0 106296 Heikki Ilmari Savolainen 5
In [48]:
# Most common ID/Athlete observations in the data based on the number of events they participated in 
display(pysqldf(""" SELECT ID, Name, COUNT(DISTINCT Event) as num_events
                    FROM olympics_games_distinct 
                    WHERE ID = 106296; """))
ID Name num_events
0 106296 Heikki Ilmari Savolainen 8
In [49]:
# Most common ID/Athlete observations in the data 
display(pysqldf(""" SELECT ID, Name, COUNT(*) as num_of_observations 
                    FROM olympics_games_distinct 
                    GROUP BY ID, Name
                    ORDER BY num_of_observations DESC; """))
ID Name num_of_observations
0 106296 Heikki Ilmari Savolainen 39
1 115354 Joseph "Josy" Stoffel 38
2 119591 Ioannis Theofilakis 36
3 89187 Takashi Ono 32
4 119590 Alexandros Theofilakis 32
... ... ... ...
135566 135562 Milan Zyka 1
135567 135564 Yevgeny Aleksandrovich Zykov 1
135568 135566 James Francis "Jim" Zylker 1
135569 135568 Olga Igorevna Zyuzkova 1
135570 135569 Andrzej ya 1

135571 rows × 3 columns

In [50]:
display(pysqldf(""" SELECT *
                    FROM olympics_games_distinct
                    WHERE Name = 'Heikki Ilmari Savolainen'; """))
ID Name Sex Age Height Weight Team NOC Region Games Year Season Sport Event City Country Medal Notes
0 106296 Heikki Ilmari Savolainen M 20.0 172.0 64.0 Finland FIN Finland 1928 Summer 1928 Summer Gymnastics Gymnastics Men's Individual All-Around Amsterdam Netherlands None None
1 106296 Heikki Ilmari Savolainen M 20.0 172.0 64.0 Finland FIN Finland 1928 Summer 1928 Summer Gymnastics Gymnastics Men's Team All-Around Amsterdam Netherlands None None
2 106296 Heikki Ilmari Savolainen M 20.0 172.0 64.0 Finland FIN Finland 1928 Summer 1928 Summer Gymnastics Gymnastics Men's Horse Vault Amsterdam Netherlands None None
3 106296 Heikki Ilmari Savolainen M 20.0 172.0 64.0 Finland FIN Finland 1928 Summer 1928 Summer Gymnastics Gymnastics Men's Parallel Bars Amsterdam Netherlands None None
4 106296 Heikki Ilmari Savolainen M 20.0 172.0 64.0 Finland FIN Finland 1928 Summer 1928 Summer Gymnastics Gymnastics Men's Horizontal Bar Amsterdam Netherlands None None
5 106296 Heikki Ilmari Savolainen M 20.0 172.0 64.0 Finland FIN Finland 1928 Summer 1928 Summer Gymnastics Gymnastics Men's Rings Amsterdam Netherlands None None
6 106296 Heikki Ilmari Savolainen M 20.0 172.0 64.0 Finland FIN Finland 1928 Summer 1928 Summer Gymnastics Gymnastics Men's Pommelled Horse Amsterdam Netherlands Bronze None
7 106296 Heikki Ilmari Savolainen M 24.0 172.0 64.0 Finland FIN Finland 1932 Summer 1932 Summer Gymnastics Gymnastics Men's Individual All-Around Los Angeles United States Bronze None
8 106296 Heikki Ilmari Savolainen M 24.0 172.0 64.0 Finland FIN Finland 1932 Summer 1932 Summer Gymnastics Gymnastics Men's Team All-Around Los Angeles United States Bronze None
9 106296 Heikki Ilmari Savolainen M 24.0 172.0 64.0 Finland FIN Finland 1932 Summer 1932 Summer Gymnastics Gymnastics Men's Floor Exercise Los Angeles United States None None
10 106296 Heikki Ilmari Savolainen M 24.0 172.0 64.0 Finland FIN Finland 1932 Summer 1932 Summer Gymnastics Gymnastics Men's Horse Vault Los Angeles United States None None
11 106296 Heikki Ilmari Savolainen M 24.0 172.0 64.0 Finland FIN Finland 1932 Summer 1932 Summer Gymnastics Gymnastics Men's Parallel Bars Los Angeles United States Bronze None
12 106296 Heikki Ilmari Savolainen M 24.0 172.0 64.0 Finland FIN Finland 1932 Summer 1932 Summer Gymnastics Gymnastics Men's Horizontal Bar Los Angeles United States Silver None
13 106296 Heikki Ilmari Savolainen M 24.0 172.0 64.0 Finland FIN Finland 1932 Summer 1932 Summer Gymnastics Gymnastics Men's Rings Los Angeles United States None None
14 106296 Heikki Ilmari Savolainen M 24.0 172.0 64.0 Finland FIN Finland 1932 Summer 1932 Summer Gymnastics Gymnastics Men's Pommelled Horse Los Angeles United States None None
15 106296 Heikki Ilmari Savolainen M 28.0 172.0 64.0 Finland FIN Finland 1936 Summer 1936 Summer Gymnastics Gymnastics Men's Individual All-Around Berlin Germany None None
16 106296 Heikki Ilmari Savolainen M 28.0 172.0 64.0 Finland FIN Finland 1936 Summer 1936 Summer Gymnastics Gymnastics Men's Team All-Around Berlin Germany Bronze None
17 106296 Heikki Ilmari Savolainen M 28.0 172.0 64.0 Finland FIN Finland 1936 Summer 1936 Summer Gymnastics Gymnastics Men's Floor Exercise Berlin Germany None None
18 106296 Heikki Ilmari Savolainen M 28.0 172.0 64.0 Finland FIN Finland 1936 Summer 1936 Summer Gymnastics Gymnastics Men's Horse Vault Berlin Germany None None
19 106296 Heikki Ilmari Savolainen M 28.0 172.0 64.0 Finland FIN Finland 1936 Summer 1936 Summer Gymnastics Gymnastics Men's Parallel Bars Berlin Germany None None
20 106296 Heikki Ilmari Savolainen M 28.0 172.0 64.0 Finland FIN Finland 1936 Summer 1936 Summer Gymnastics Gymnastics Men's Horizontal Bar Berlin Germany None None
21 106296 Heikki Ilmari Savolainen M 28.0 172.0 64.0 Finland FIN Finland 1936 Summer 1936 Summer Gymnastics Gymnastics Men's Rings Berlin Germany None None
22 106296 Heikki Ilmari Savolainen M 28.0 172.0 64.0 Finland FIN Finland 1936 Summer 1936 Summer Gymnastics Gymnastics Men's Pommelled Horse Berlin Germany None None
23 106296 Heikki Ilmari Savolainen M 40.0 172.0 64.0 Finland FIN Finland 1948 Summer 1948 Summer Gymnastics Gymnastics Men's Individual All-Around London United Kingdom None None
24 106296 Heikki Ilmari Savolainen M 40.0 172.0 64.0 Finland FIN Finland 1948 Summer 1948 Summer Gymnastics Gymnastics Men's Team All-Around London United Kingdom Gold None
25 106296 Heikki Ilmari Savolainen M 40.0 172.0 64.0 Finland FIN Finland 1948 Summer 1948 Summer Gymnastics Gymnastics Men's Floor Exercise London United Kingdom None None
26 106296 Heikki Ilmari Savolainen M 40.0 172.0 64.0 Finland FIN Finland 1948 Summer 1948 Summer Gymnastics Gymnastics Men's Horse Vault London United Kingdom None None
27 106296 Heikki Ilmari Savolainen M 40.0 172.0 64.0 Finland FIN Finland 1948 Summer 1948 Summer Gymnastics Gymnastics Men's Parallel Bars London United Kingdom None None
28 106296 Heikki Ilmari Savolainen M 40.0 172.0 64.0 Finland FIN Finland 1948 Summer 1948 Summer Gymnastics Gymnastics Men's Horizontal Bar London United Kingdom None None
29 106296 Heikki Ilmari Savolainen M 40.0 172.0 64.0 Finland FIN Finland 1948 Summer 1948 Summer Gymnastics Gymnastics Men's Rings London United Kingdom None None
30 106296 Heikki Ilmari Savolainen M 40.0 172.0 64.0 Finland FIN Finland 1948 Summer 1948 Summer Gymnastics Gymnastics Men's Pommelled Horse London United Kingdom Gold None
31 106296 Heikki Ilmari Savolainen M 44.0 172.0 64.0 Finland FIN Finland 1952 Summer 1952 Summer Gymnastics Gymnastics Men's Individual All-Around Helsinki Finland None None
32 106296 Heikki Ilmari Savolainen M 44.0 172.0 64.0 Finland FIN Finland 1952 Summer 1952 Summer Gymnastics Gymnastics Men's Team All-Around Helsinki Finland Bronze None
33 106296 Heikki Ilmari Savolainen M 44.0 172.0 64.0 Finland FIN Finland 1952 Summer 1952 Summer Gymnastics Gymnastics Men's Floor Exercise Helsinki Finland None None
34 106296 Heikki Ilmari Savolainen M 44.0 172.0 64.0 Finland FIN Finland 1952 Summer 1952 Summer Gymnastics Gymnastics Men's Horse Vault Helsinki Finland None None
35 106296 Heikki Ilmari Savolainen M 44.0 172.0 64.0 Finland FIN Finland 1952 Summer 1952 Summer Gymnastics Gymnastics Men's Parallel Bars Helsinki Finland None None
36 106296 Heikki Ilmari Savolainen M 44.0 172.0 64.0 Finland FIN Finland 1952 Summer 1952 Summer Gymnastics Gymnastics Men's Horizontal Bar Helsinki Finland None None
37 106296 Heikki Ilmari Savolainen M 44.0 172.0 64.0 Finland FIN Finland 1952 Summer 1952 Summer Gymnastics Gymnastics Men's Rings Helsinki Finland None None
38 106296 Heikki Ilmari Savolainen M 44.0 172.0 64.0 Finland FIN Finland 1952 Summer 1952 Summer Gymnastics Gymnastics Men's Pommelled Horse Helsinki Finland None None
In [51]:
# Most common ID/Athlete based on number of distinct games played 
display(pysqldf(""" SELECT ID, Name, COUNT(DISTINCT Games) as num_games,  COUNT(DISTINCT Sport) as num_sports_played
                    FROM olympics_games_distinct 
                    WHERE Games IS NOT NULL
                    GROUP BY ID, Name
                    ORDER BY num_games DESC; """))
ID Name num_games num_sports_played
0 79855 Ian Millar 10 1
1 65378 Afanasijs Kuzmins 9 1
2 99155 Hubert Raudaschl 9 1
3 14388 Francisco Boza Dibos 8 1
4 26880 Rajmond Debevec 8 1
... ... ... ... ...
135566 135564 Yevgeny Aleksandrovich Zykov 1 1
135567 135566 James Francis "Jim" Zylker 1 1
135568 135568 Olga Igorevna Zyuzkova 1 1
135569 135569 Andrzej ya 1 1
135570 135570 Piotr ya 1 1

135571 rows × 4 columns

In [52]:
# Average number of games played by athletes in the Olympics overall
display(pysqldf(""" SELECT AVG(num_games) AS avg_num_games
                    FROM(
                        SELECT ID, Name, COUNT(DISTINCT Games) as num_games,  COUNT(DISTINCT Sport) as num_sports
                        FROM olympics_games_distinct 
                        WHERE Games IS NOT NULL
                        GROUP BY ID, Name
                        ORDER BY num_games DESC) """))
avg_num_games
0 1.382685
Number of distinct sport each athlete play¶
In [53]:
# Number of games and number of sports played by each athlete
display(pysqldf(""" SELECT ID, Name,   COUNT(DISTINCT Sport) as num_sports, COUNT(DISTINCT Games) as num_games
                    FROM olympics_games_distinct 
                    WHERE Games IS NOT NULL
                    GROUP BY ID, Name
                    ORDER BY num_sports DESC""" )) 
ID Name num_sports num_games
0 6386 Karl Johan Baadsvik 4 1
1 24534 Bronisaw Czech 4 3
2 32330 Launceston Elliot 4 2
3 52387 William Archibald "Bill" Irwin 4 1
4 54456 Alexander Viggo Jensen 4 2
... ... ... ... ...
135566 135567 Aleksandr Viktorovich Zyuzin 1 2
135567 135568 Olga Igorevna Zyuzkova 1 1
135568 135569 Andrzej ya 1 1
135569 135570 Piotr ya 1 1
135570 135571 Tomasz Ireneusz ya 1 2

135571 rows × 4 columns

In [194]:
display(pysqldf(""" SELECT ID, Name, Games, Sport
                    FROM olympics_games_distinct 
                    WHERE ID = 6386
                    """ )) 
ID Name Games Sport
0 6386 Karl Johan Baadsvik 1936 Winter Ski Jumping
1 6386 Karl Johan Baadsvik 1936 Winter Cross Country Skiing
2 6386 Karl Johan Baadsvik 1936 Winter Nordic Combined
3 6386 Karl Johan Baadsvik 1936 Winter Alpine Skiing
In [55]:
# How many games do athletes in the Olympics commonly play? Do they tend to go back for another game? 
# Observation: only 1 athlete played 10 different olympics games 
display(pysqldf(""" 
                        SELECT
                            num_games,
                            COUNT(*) as athlete_cnt
                        FROM(
                                SELECT ID, Name, COUNT(DISTINCT Games) as num_games
                                FROM olympics_games_distinct 
                                WHERE Games IS NOT NULL
                                GROUP BY ID, Name)
                        GROUP BY num_games  """ )) 
num_games athlete_cnt
0 1 98450
1 2 26198
2 3 8051
3 4 2146
4 5 545
5 6 139
6 7 30
7 8 9
8 9 2
9 10 1
In [56]:
# get separate male and female records 
female_df = pysqldf("""SELECT * FROM olympics_games_distinct WHERE Sex = 'F'; """)
male_df = pysqldf("""SELECT * FROM olympics_games_distinct WHERE Sex = 'M'; """)
In [191]:
descriptive_stats('female_df', 'female_df.Weight')
**************************************************
Table Name: female_df
Table Column: female_df.Weight
**************************************************

Value
Stat
COUNT 66585.0
MEAN 60.0
MEDIAN 58.0
MIN 25.0
MAX 167.0
Value Count
Stat
MODE 60.0 4202

In [192]:
descriptive_stats('male_df', 'male_df.Weight')
**************************************************
Table Name: male_df
Table Column: male_df.Weight
**************************************************

Value
Stat
COUNT 140561.0
MEAN 75.0
MEDIAN 69.0
MIN 28.0
MAX 214.0
Value Count
Stat
MODE 70.0 7603

In [195]:
get_iqrs('female_df', 'Height')
pct last_val
0 0.25 160.0
1 0.50 167.0
2 0.75 173.0
3 1.00 213.0
In [197]:
get_iqrs('male_df', 'Height')
pct last_val
0 0.25 NaN
1 0.50 175.0
2 0.75 183.0
3 1.00 226.0
In [57]:
# number of specific sports each athlete participated in 
display(pysqldf(""" SELECT 
                            ID, Name, COUNT(DISTINCT Sport) as num_sports 
                    FROM olympics_games_distinct 
                    WHERE Sport IS NOT NULL
                    GROUP BY ID, Name
                    ORDER BY num_sports DESC; """))
ID Name num_sports
0 6386 Karl Johan Baadsvik 4
1 24534 Bronisaw Czech 4
2 32330 Launceston Elliot 4
3 52387 William Archibald "Bill" Irwin 4
4 54456 Alexander Viggo Jensen 4
... ... ... ...
135566 135567 Aleksandr Viktorovich Zyuzin 1
135567 135568 Olga Igorevna Zyuzkova 1
135568 135569 Andrzej ya 1
135569 135570 Piotr ya 1
135570 135571 Tomasz Ireneusz ya 1

135571 rows × 3 columns

In [58]:
# number of sports each female athlete participate in 
display(pysqldf(""" SELECT ID, Name, COUNT(DISTINCT Sport) as num_sports 
                    FROM female_df 
                    WHERE Sport IS NOT NULL
                    GROUP BY ID, Name
                    ORDER BY num_sports DESC"""))
ID Name num_sports
0 82613 Jaqueline Mouro 3
1 110716 Georgia Simmerling 3
2 118525 Sheila Christine Taormina 3
3 968 Margarete "Grete" Adler 2
4 3615 Ana Paula Rodrigues Connelly (-Rodrigues Henkel) 2
... ... ... ...
33976 135550 Vera Vasilyevna Zyatikova 1
33977 135553 Galina Ivanovna Zybina (-Fyodorova) 1
33978 135560 Stavroula Zygouri 1
33979 135563 Olesya Nikolayevna Zykina 1
33980 135568 Olga Igorevna Zyuzkova 1

33981 rows × 3 columns

In [59]:
# number female athletes who participate in more than 1 sport 
display(pysqldf(""" SELECT COUNT(*) AS female_2ormore 
                    FROM(
                        SELECT ID, Name, COUNT(DISTINCT Sport) as num_sports 
                        FROM female_df 
                        WHERE Sport IS NOT NULL
                        GROUP BY ID, Name
                        ORDER BY num_sports DESC)
                    WHERE num_sports > 1"""))
female_2ormore
0 117
In [60]:
# number of sports each male athlete participate in 
display(pysqldf(""" SELECT ID, Name, COUNT(DISTINCT Sport) as num_sports 
                    FROM male_df 
                    WHERE Sport IS NOT NULL
                    GROUP BY ID, Name
                    ORDER BY num_sports DESC; """))
ID Name num_sports
0 6386 Karl Johan Baadsvik 4
1 24534 Bronisaw Czech 4
2 32330 Launceston Elliot 4
3 52387 William Archibald "Bill" Irwin 4
4 54456 Alexander Viggo Jensen 4
... ... ... ...
101585 135566 James Francis "Jim" Zylker 1
101586 135567 Aleksandr Viktorovich Zyuzin 1
101587 135569 Andrzej ya 1
101588 135570 Piotr ya 1
101589 135571 Tomasz Ireneusz ya 1

101590 rows × 3 columns

In [61]:
# number male athletes who play more than 1 sport 
display(pysqldf(""" SELECT COUNT(*) AS male_2ormore 
                    FROM(
                        SELECT ID, Name, COUNT(DISTINCT Sport) as num_sports 
                        FROM male_df 
                        WHERE Sport IS NOT NULL
                        GROUP BY ID, Name
                        ORDER BY num_sports DESC)
                    WHERE num_sports > 1"""))
male_2ormore
0 891
Check if there are any athlete labeled as both male and female¶
In [62]:
display(pysqldf("""SELECT * FROM male_df WHERE Sex = 'F'; """))
ID Name Sex Age Height Weight Team NOC Region Games Year Season Sport Event City Country Medal Notes
In [63]:
display(pysqldf("""SELECT * FROM female_df WHERE Sex = 'M'; """))
ID Name Sex Age Height Weight Team NOC Region Games Year Season Sport Event City Country Medal Notes
In [64]:
# Players with two genders (None) 
display(pysqldf(""" SELECT 
                        ID, Name, COUNT(DISTINCT Sex) AS gender_count 
                    FROM olympics_games_distinct 
                    GROUP BY ID, Name
                    ORDER BY gender_count DESC; """))
ID Name gender_count
0 1 A Dijiang 1
1 2 A Lamusi 1
2 3 Gunnar Nielsen Aaby 1
3 4 Edgar Lindenau Aabye 1
4 5 Christine Jacoba Aaftink 1
... ... ... ...
135566 135567 Aleksandr Viktorovich Zyuzin 1
135567 135568 Olga Igorevna Zyuzkova 1
135568 135569 Andrzej ya 1
135569 135570 Piotr ya 1
135570 135571 Tomasz Ireneusz ya 1

135571 rows × 3 columns

Region with athletes who play different sports¶
In [65]:
# number of sports played by every atlete in each region 
display(pysqldf(""" SELECT Region, ID, Name, COUNT(DISTINCT Sport) as num_sports 
                    FROM olympics_games_distinct  
                    WHERE Sport IS NOT NULL
                    GROUP BY Region, ID, Name
                    ORDER BY num_sports DESC"""))
Region ID Name num_sports
0 Canada 6386 Karl Johan Baadsvik 4
1 Canada 52387 William Archibald "Bill" Irwin 4
2 Denmark 54456 Alexander Viggo Jensen 4
3 Germany 107613 Carl Schuhmann 4
4 Poland 24534 Bronisaw Czech 4
... ... ... ... ...
136304 Zimbabwe 130453 Antonette Wilken (-Batchelor) 1
136305 Zimbabwe 130832 Hillary Wilson 1
136306 Zimbabwe 130880 Peter Arthur Wilson 1
136307 Zimbabwe 131478 Jennifer "Jenny" Wood 1
136308 Zimbabwe 135497 Lloyd Zvasiya 1

136309 rows × 4 columns

In [66]:
# region with the most versatile athletes based on the number of sports they play
display(pysqldf(""" SELECT Region, COUNT(*) AS ath_count 
                    FROM(
                        SELECT Region, ID, Name, COUNT(DISTINCT Sport) as num_sports 
                        FROM olympics_games_distinct  
                        WHERE Sport IS NOT NULL
                        GROUP BY Region, ID, Name
                        ORDER BY num_sports DESC)
                    WHERE num_sports > 1
                    GROUP BY Region
                    ORDER BY ath_count DESC"""))
Region ath_count
0 United States 120
1 Sweden 67
2 France 65
3 United Kingdom 50
4 Germany 49
5 Austria 48
6 Hungary 47
7 Finland 46
8 Norway 37
9 Belgium 37
10 Greece 36
11 Switzerland 35
12 Canada 29
13 Japan 27
14 Italy 27
15 Czech Republic 27
16 Netherlands 25
17 Poland 23
18 Brazil 18
19 Australia 18
20 Russia 11
21 Denmark 10
22 China 9
23 Spain 8
24 Romania 8
25 Argentina 8
26 Serbia 7
27 Mexico 7
28 Egypt 7
29 Portugal 6
30 Latvia 6
31 Slovakia 5
32 Moldova 5
33 India 5
34 Taiwan 4
35 New Zealand 4
36 Monaco 4
37 Luxembourg 4
38 Estonia 4
39 Croatia 4
40 Bulgaria 4
41 Bahrain 3
42 Turkey 2
43 South Korea 2
44 South Africa 2
45 Slovenia 2
46 Puerto Rico 2
47 Mongolia 2
48 Lebanon 2
49 Ireland 2
50 Curacao 2
51 Virgin Islands, US 1
52 Philippines 1
53 Pakistan 1
54 Morocco 1
55 Lithuania 1
56 Liechtenstein 1
57 Iraq 1
58 Iran 1
59 Iceland 1
60 Guatemala 1
61 Costa Rica 1
62 Colombia 1
63 Chile 1
64 Bosnia and Herzegovina 1
65 Boliva 1
66 Bermuda 1
67 Belize 1
68 Belarus 1
69 American Samoa 1

Percent difference¶

In [67]:
# Percent difference between total number of male and female athletes in the Olympics  
display(pysqldf(""" SELECT Sex, COUNT(DISTINCT ID) AS count 
                    FROM olympics_games_distinct
                    GROUP BY Sex; """))
Sex count
0 F 33981
1 M 101590
In [68]:
# Percent difference between total number of male and female athletes  
((abs(101590-33981)) / ((101590+33981)/2))*100
Out[68]:
99.73961983019967

Hypothesis 1 tables¶

Metric 1: Total number of medals¶
In [69]:
# add team_type column (away or home)
olympics_games_distdf = pysqldf(""" 
                                    SELECT 
                                          ID,Name,Sex,Age,Height,Weight,Team,NOC,Region,Games,Year,Season,Sport,Event,City,Country,Medal,Notes, 
                                          (CASE WHEN Region != Country THEN 'away' 
                                                WHEN Region = Country THEN 'home'
                                                END) AS TeamType
                                    FROM olympics_games_distinct; 
                                """)
olympics_games_distdf
Out[69]:
ID Name Sex Age Height Weight Team NOC Region Games Year Season Sport Event City Country Medal Notes TeamType
0 1 A Dijiang M 24.0 180.0 80.0 China CHN China 1992 Summer 1992 Summer Basketball Basketball Men's Basketball Barcelona Spain None None away
1 2 A Lamusi M 23.0 170.0 60.0 China CHN China 2012 Summer 2012 Summer Judo Judo Men's Extra-Lightweight London United Kingdom None None away
2 3 Gunnar Nielsen Aaby M 24.0 NaN NaN Denmark DEN Denmark 1920 Summer 1920 Summer Football Football Men's Football Antwerpen Belgium None None away
3 4 Edgar Lindenau Aabye M 34.0 NaN NaN Denmark/Sweden DEN Denmark 1900 Summer 1900 Summer Tug-Of-War Tug-Of-War Men's Tug-Of-War Paris France Gold None away
4 5 Christine Jacoba Aaftink F 21.0 185.0 82.0 Netherlands NED Netherlands 1988 Winter 1988 Winter Speed Skating Speed Skating Women's 500 metres Calgary Canada None None away
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
269726 135569 Andrzej ya M 29.0 179.0 89.0 Poland-1 POL Poland 1976 Winter 1976 Winter Luge Luge Mixed (Men)'s Doubles Innsbruck Austria None None away
269727 135570 Piotr ya M 27.0 176.0 59.0 Poland POL Poland 2014 Winter 2014 Winter Ski Jumping Ski Jumping Men's Large Hill, Individual Sochi Russia None None away
269728 135570 Piotr ya M 27.0 176.0 59.0 Poland POL Poland 2014 Winter 2014 Winter Ski Jumping Ski Jumping Men's Large Hill, Team Sochi Russia None None away
269729 135571 Tomasz Ireneusz ya M 30.0 185.0 96.0 Poland POL Poland 1998 Winter 1998 Winter Bobsleigh Bobsleigh Men's Four Nagano Japan None None away
269730 135571 Tomasz Ireneusz ya M 34.0 185.0 96.0 Poland POL Poland 2002 Winter 2002 Winter Bobsleigh Bobsleigh Men's Four Salt Lake City United States None None away

269731 rows × 19 columns

In [70]:
# number of home and away teams in every game
display(pysqldf("""  WITH hometeams AS( SELECT Games, COUNT(DISTINCT Team) AS home_team_cnt 
                                                      FROM olympics_games_distinct
                                                      WHERE Region=Country
                                                      GROUP BY Games)
                                    SELECT *
                                    FROM hometeams
                                    LEFT JOIN( SELECT Games, COUNT(DISTINCT Team) AS away_team_cnt 
                                               FROM olympics_games_distinct
                                               WHERE Region!=Country
                                               GROUP BY Games)
                                    USING(Games)
                                """))
Games home_team_cnt away_team_cnt
0 1896 Summer 5 13
1 1900 Summer 122 77
2 1904 Summer 59 27
3 1906 Summer 21 33
4 1908 Summer 34 39
5 1912 Summer 17 84
6 1920 Summer 9 63
7 1924 Summer 5 85
8 1924 Winter 3 25
9 1928 Summer 3 64
10 1928 Winter 3 38
11 1932 Summer 4 55
12 1932 Winter 3 26
13 1936 Summer 5 100
14 1936 Winter 3 51
15 1948 Summer 5 106
16 1948 Winter 3 43
17 1952 Summer 5 128
18 1952 Winter 3 49
19 1956 Summer 6 118
20 1956 Winter 3 53
21 1960 Summer 5 181
22 1960 Winter 4 36
23 1964 Summer 5 163
24 1964 Winter 3 63
25 1968 Summer 1 111
26 1968 Winter 3 67
27 1972 Summer 8 131
28 1972 Winter 3 60
29 1976 Summer 1 91
30 1976 Winter 3 67
31 1980 Summer 1 79
32 1980 Winter 3 55
33 1984 Summer 1 139
34 1984 Winter 3 77
35 1988 Summer 3 174
36 1988 Winter 4 94
37 1992 Summer 1 211
38 1992 Winter 4 107
39 1994 Winter 1 100
40 1996 Summer 4 242
41 1998 Winter 3 103
42 2000 Summer 4 239
43 2002 Winter 3 111
44 2004 Summer 3 257
45 2006 Winter 3 110
46 2008 Summer 5 287
47 2010 Winter 3 113
48 2012 Summer 3 242
49 2014 Winter 4 115
50 2016 Summer 3 245
In [71]:
# combined medal count for home and away teams per Olympics game 
display(pysqldf("""   SELECT *
                      FROM( SELECT Games, COUNT(Medal) AS home_medal_count 
                            FROM olympics_games_distdf
                            WHERE TeamType = 'home'
                            GROUP BY Games)
                     LEFT JOIN( SELECT Games, COUNT(Medal) AS away_medal_count 
                                FROM olympics_games_distdf
                                WHERE TeamType = 'away'
                                GROUP BY Games)
                    USING(Games); 
                """))
Games home_medal_count away_medal_count
0 1896 Summer 48 95
1 1900 Summer 225 368
2 1904 Summer 394 92
3 1906 Summer 102 356
4 1908 Summer 368 463
5 1912 Summer 190 751
6 1920 Summer 188 1120
7 1924 Summer 110 722
8 1924 Winter 10 120
9 1928 Summer 57 677
10 1928 Winter 12 77
11 1932 Summer 189 458
12 1932 Winter 34 58
13 1936 Summer 224 693
14 1936 Winter 7 101
15 1948 Summer 61 791
16 1948 Winter 28 107
17 1952 Summer 40 857
18 1952 Winter 19 117
19 1956 Summer 72 821
20 1956 Winter 8 142
21 1960 Summer 88 823
22 1960 Winter 27 120
23 1964 Summer 62 967
24 1964 Winter 17 169
25 1968 Summer 9 1048
26 1968 Winter 9 190
27 1972 Summer 253 962
28 1972 Winter 3 196
29 1976 Summer 23 1297
30 1976 Winter 7 204
31 1980 Summer 442 942
32 1980 Winter 30 188
33 1984 Summer 352 1124
34 1984 Winter 1 221
35 1988 Summer 77 1505
36 1988 Winter 6 257
37 1992 Summer 69 1643
38 1992 Winter 12 306
39 1994 Winter 30 301
40 1996 Summer 259 1583
41 1998 Winter 13 427
42 2000 Summer 183 1821
43 2002 Winter 84 394
44 2004 Summer 31 1970
45 2006 Winter 25 501
46 2008 Summer 184 1864
47 2010 Winter 90 430
48 2012 Summer 126 1815
49 2014 Winter 68 529
50 2016 Summer 50 1973
In [72]:
# medal count for home and away teams per Olympics game 
medalcnt_awayhome_df = pysqldf("""  WITH teams_count AS(SELECT *
                                                        FROM (SELECT Games, COUNT(DISTINCT Team) AS home_team_cnt 
                                                              FROM olympics_games_distinct
                                                              WHERE Region=Country
                                                              GROUP BY Games)
                                                        LEFT JOIN( SELECT Games, COUNT(DISTINCT Team) AS away_team_cnt 
                                                                   FROM olympics_games_distinct
                                                                   WHERE Region!=Country
                                                                   GROUP BY Games)
                                                        USING(Games) ) 
                                    SELECT * 
                                    FROM teams_count
                                    LEFT JOIN (
                                                  SELECT *
                                                  FROM( SELECT Games, COUNT(Medal) AS home_medal_count 
                                                        FROM olympics_games_distdf
                                                        WHERE TeamType = 'home'
                                                        GROUP BY Games)
                                                  LEFT JOIN( SELECT Games, COUNT(Medal) AS away_medal_count 
                                                            FROM olympics_games_distdf
                                                            WHERE TeamType = 'away'
                                                            GROUP BY Games)
                                                 USING(Games)
                                                )
                                    USING(Games)
                """)
medalcnt_awayhome_df
Out[72]:
Games home_team_cnt away_team_cnt home_medal_count away_medal_count
0 1896 Summer 5 13 48 95
1 1900 Summer 122 77 225 368
2 1904 Summer 59 27 394 92
3 1906 Summer 21 33 102 356
4 1908 Summer 34 39 368 463
5 1912 Summer 17 84 190 751
6 1920 Summer 9 63 188 1120
7 1924 Summer 5 85 110 722
8 1924 Winter 3 25 10 120
9 1928 Summer 3 64 57 677
10 1928 Winter 3 38 12 77
11 1932 Summer 4 55 189 458
12 1932 Winter 3 26 34 58
13 1936 Summer 5 100 224 693
14 1936 Winter 3 51 7 101
15 1948 Summer 5 106 61 791
16 1948 Winter 3 43 28 107
17 1952 Summer 5 128 40 857
18 1952 Winter 3 49 19 117
19 1956 Summer 6 118 72 821
20 1956 Winter 3 53 8 142
21 1960 Summer 5 181 88 823
22 1960 Winter 4 36 27 120
23 1964 Summer 5 163 62 967
24 1964 Winter 3 63 17 169
25 1968 Summer 1 111 9 1048
26 1968 Winter 3 67 9 190
27 1972 Summer 8 131 253 962
28 1972 Winter 3 60 3 196
29 1976 Summer 1 91 23 1297
30 1976 Winter 3 67 7 204
31 1980 Summer 1 79 442 942
32 1980 Winter 3 55 30 188
33 1984 Summer 1 139 352 1124
34 1984 Winter 3 77 1 221
35 1988 Summer 3 174 77 1505
36 1988 Winter 4 94 6 257
37 1992 Summer 1 211 69 1643
38 1992 Winter 4 107 12 306
39 1994 Winter 1 100 30 301
40 1996 Summer 4 242 259 1583
41 1998 Winter 3 103 13 427
42 2000 Summer 4 239 183 1821
43 2002 Winter 3 111 84 394
44 2004 Summer 3 257 31 1970
45 2006 Winter 3 110 25 501
46 2008 Summer 5 287 184 1864
47 2010 Winter 3 113 90 430
48 2012 Summer 3 242 126 1815
49 2014 Winter 4 115 68 529
50 2016 Summer 3 245 50 1973
In [73]:
medalavg_awayhome_df = pysqldf(""" SELECT 
                                        Games, home_team_cnt,away_team_cnt,home_medal_count,away_medal_count,
                                        CAST(home_medal_count AS float) / CAST(home_team_cnt AS float) AS home_avgmedal, 
                                        CAST(away_medal_count AS float) / CAST(away_team_cnt AS float) AS away_avgmedal 
                                   FROM medalcnt_awayhome_df; """)
medalavg_awayhome_df
Out[73]:
Games home_team_cnt away_team_cnt home_medal_count away_medal_count home_avgmedal away_avgmedal
0 1896 Summer 5 13 48 95 9.600000 7.307692
1 1900 Summer 122 77 225 368 1.844262 4.779221
2 1904 Summer 59 27 394 92 6.677966 3.407407
3 1906 Summer 21 33 102 356 4.857143 10.787879
4 1908 Summer 34 39 368 463 10.823529 11.871795
5 1912 Summer 17 84 190 751 11.176471 8.940476
6 1920 Summer 9 63 188 1120 20.888889 17.777778
7 1924 Summer 5 85 110 722 22.000000 8.494118
8 1924 Winter 3 25 10 120 3.333333 4.800000
9 1928 Summer 3 64 57 677 19.000000 10.578125
10 1928 Winter 3 38 12 77 4.000000 2.026316
11 1932 Summer 4 55 189 458 47.250000 8.327273
12 1932 Winter 3 26 34 58 11.333333 2.230769
13 1936 Summer 5 100 224 693 44.800000 6.930000
14 1936 Winter 3 51 7 101 2.333333 1.980392
15 1948 Summer 5 106 61 791 12.200000 7.462264
16 1948 Winter 3 43 28 107 9.333333 2.488372
17 1952 Summer 5 128 40 857 8.000000 6.695312
18 1952 Winter 3 49 19 117 6.333333 2.387755
19 1956 Summer 6 118 72 821 12.000000 6.957627
20 1956 Winter 3 53 8 142 2.666667 2.679245
21 1960 Summer 5 181 88 823 17.600000 4.546961
22 1960 Winter 4 36 27 120 6.750000 3.333333
23 1964 Summer 5 163 62 967 12.400000 5.932515
24 1964 Winter 3 63 17 169 5.666667 2.682540
25 1968 Summer 1 111 9 1048 9.000000 9.441441
26 1968 Winter 3 67 9 190 3.000000 2.835821
27 1972 Summer 8 131 253 962 31.625000 7.343511
28 1972 Winter 3 60 3 196 1.000000 3.266667
29 1976 Summer 1 91 23 1297 23.000000 14.252747
30 1976 Winter 3 67 7 204 2.333333 3.044776
31 1980 Summer 1 79 442 942 442.000000 11.924051
32 1980 Winter 3 55 30 188 10.000000 3.418182
33 1984 Summer 1 139 352 1124 352.000000 8.086331
34 1984 Winter 3 77 1 221 0.333333 2.870130
35 1988 Summer 3 174 77 1505 25.666667 8.649425
36 1988 Winter 4 94 6 257 1.500000 2.734043
37 1992 Summer 1 211 69 1643 69.000000 7.786730
38 1992 Winter 4 107 12 306 3.000000 2.859813
39 1994 Winter 1 100 30 301 30.000000 3.010000
40 1996 Summer 4 242 259 1583 64.750000 6.541322
41 1998 Winter 3 103 13 427 4.333333 4.145631
42 2000 Summer 4 239 183 1821 45.750000 7.619247
43 2002 Winter 3 111 84 394 28.000000 3.549550
44 2004 Summer 3 257 31 1970 10.333333 7.665370
45 2006 Winter 3 110 25 501 8.333333 4.554545
46 2008 Summer 5 287 184 1864 36.800000 6.494774
47 2010 Winter 3 113 90 430 30.000000 3.805310
48 2012 Summer 3 242 126 1815 42.000000 7.500000
49 2014 Winter 4 115 68 529 17.000000 4.600000
50 2016 Summer 3 245 50 1973 16.666667 8.053061
In [74]:
#Count number of  home and away distinct atheletes in each game 
homeaway_count_df = pysqldf(""" 
                    SELECT Games, 
                           COUNT(CASE WHEN Region != Country THEN 1 END) AS away_athletes_count,  
                           COUNT(CASE WHEN Region = Country THEN 1 END) AS home_athletes_count
                    FROM (  SELECT DISTINCT Games, Team, Region, Country, ID
                            FROM olympics_games_distinct)
                    GROUP BY Games; 
                """)
homeaway_count_df
Out[74]:
Games away_athletes_count home_athletes_count
0 1896 Summer 78 109
1 1900 Summer 544 857
2 1904 Summer 135 702
3 1906 Summer 536 338
4 1908 Summer 1297 768
5 1912 Summer 2035 467
6 1920 Summer 2379 348
7 1924 Summer 2924 407
8 1924 Winter 274 46
9 1928 Summer 2982 266
10 1928 Winter 428 41
11 1932 Summer 1451 474
12 1932 Winter 195 64
13 1936 Summer 4051 433
14 1936 Winter 621 56
15 1948 Summer 3995 407
16 1948 Winter 611 73
17 1952 Summer 4674 258
18 1952 Winter 633 74
19 1956 Summer 3044 303
20 1956 Winter 772 65
21 1960 Summer 5072 280
22 1960 Winter 586 79
23 1964 Summer 4809 328
24 1964 Winter 1049 86
25 1968 Summer 5283 275
26 1968 Winter 1113 77
27 1972 Summer 6393 721
28 1972 Winter 957 90
29 1976 Summer 5688 385
30 1976 Winter 1084 82
31 1980 Summer 4770 489
32 1980 Winter 991 102
33 1984 Summer 6276 522
34 1984 Winter 1225 74
35 1988 Summer 8075 404
36 1988 Winter 1356 114
37 1992 Summer 9012 422
38 1992 Winter 1735 110
39 1994 Winter 1668 87
40 1996 Summer 9761 648
41 1998 Winter 2045 157
42 2000 Summer 10081 629
43 2002 Winter 2221 202
44 2004 Summer 10226 426
45 2006 Winter 2326 180
46 2008 Summer 10353 636
47 2010 Winter 2347 201
48 2012 Summer 10038 537
49 2014 Winter 2590 223
50 2016 Summer 10746 464
Number of home and away teams in every game¶
In [75]:
#count number of games where  home medals > away team medals
display(pysqldf("""
                    SELECT COUNT(*) AS count 
                    FROM medalavg_awayhome_df
                    WHERE home_medal_count < away_medal_count
                """))
count
0 50
In [76]:
#count number of games where home medals > away team medals
display(pysqldf("""
                    SELECT COUNT(*) AS count 
                    FROM medalavg_awayhome_df
                    WHERE home_medal_count > away_medal_count
                """))
count
0 1
In [77]:
#count number of games where avg home team medals < away team medals
display(pysqldf("""
                    SELECT *
                    FROM medalavg_awayhome_df
                    WHERE home_avgmedal < away_avgmedal
                """))
Games home_team_cnt away_team_cnt home_medal_count away_medal_count home_avgmedal away_avgmedal
0 1900 Summer 122 77 225 368 1.844262 4.779221
1 1906 Summer 21 33 102 356 4.857143 10.787879
2 1908 Summer 34 39 368 463 10.823529 11.871795
3 1924 Winter 3 25 10 120 3.333333 4.800000
4 1956 Winter 3 53 8 142 2.666667 2.679245
5 1968 Summer 1 111 9 1048 9.000000 9.441441
6 1972 Winter 3 60 3 196 1.000000 3.266667
7 1976 Winter 3 67 7 204 2.333333 3.044776
8 1984 Winter 3 77 1 221 0.333333 2.870130
9 1988 Winter 4 94 6 257 1.500000 2.734043
In [78]:
#count games where avg home team medals > away team medals
display(pysqldf("""
                    SELECT *
                    FROM medalavg_awayhome_df
                    WHERE home_avgmedal > away_avgmedal
                """))
Games home_team_cnt away_team_cnt home_medal_count away_medal_count home_avgmedal away_avgmedal
0 1896 Summer 5 13 48 95 9.600000 7.307692
1 1904 Summer 59 27 394 92 6.677966 3.407407
2 1912 Summer 17 84 190 751 11.176471 8.940476
3 1920 Summer 9 63 188 1120 20.888889 17.777778
4 1924 Summer 5 85 110 722 22.000000 8.494118
5 1928 Summer 3 64 57 677 19.000000 10.578125
6 1928 Winter 3 38 12 77 4.000000 2.026316
7 1932 Summer 4 55 189 458 47.250000 8.327273
8 1932 Winter 3 26 34 58 11.333333 2.230769
9 1936 Summer 5 100 224 693 44.800000 6.930000
10 1936 Winter 3 51 7 101 2.333333 1.980392
11 1948 Summer 5 106 61 791 12.200000 7.462264
12 1948 Winter 3 43 28 107 9.333333 2.488372
13 1952 Summer 5 128 40 857 8.000000 6.695312
14 1952 Winter 3 49 19 117 6.333333 2.387755
15 1956 Summer 6 118 72 821 12.000000 6.957627
16 1960 Summer 5 181 88 823 17.600000 4.546961
17 1960 Winter 4 36 27 120 6.750000 3.333333
18 1964 Summer 5 163 62 967 12.400000 5.932515
19 1964 Winter 3 63 17 169 5.666667 2.682540
20 1968 Winter 3 67 9 190 3.000000 2.835821
21 1972 Summer 8 131 253 962 31.625000 7.343511
22 1976 Summer 1 91 23 1297 23.000000 14.252747
23 1980 Summer 1 79 442 942 442.000000 11.924051
24 1980 Winter 3 55 30 188 10.000000 3.418182
25 1984 Summer 1 139 352 1124 352.000000 8.086331
26 1988 Summer 3 174 77 1505 25.666667 8.649425
27 1992 Summer 1 211 69 1643 69.000000 7.786730
28 1992 Winter 4 107 12 306 3.000000 2.859813
29 1994 Winter 1 100 30 301 30.000000 3.010000
30 1996 Summer 4 242 259 1583 64.750000 6.541322
31 1998 Winter 3 103 13 427 4.333333 4.145631
32 2000 Summer 4 239 183 1821 45.750000 7.619247
33 2002 Winter 3 111 84 394 28.000000 3.549550
34 2004 Summer 3 257 31 1970 10.333333 7.665370
35 2006 Winter 3 110 25 501 8.333333 4.554545
36 2008 Summer 5 287 184 1864 36.800000 6.494774
37 2010 Winter 3 113 90 430 30.000000 3.805310
38 2012 Summer 3 242 126 1815 42.000000 7.500000
39 2014 Winter 4 115 68 529 17.000000 4.600000
40 2016 Summer 3 245 50 1973 16.666667 8.053061
In [79]:
#count number of games where avg home medals < away team medals
display(pysqldf("""
                    SELECT *
                    FROM medalavg_awayhome_df
                    WHERE home_avgmedal < away_avgmedal
                """))
Games home_team_cnt away_team_cnt home_medal_count away_medal_count home_avgmedal away_avgmedal
0 1900 Summer 122 77 225 368 1.844262 4.779221
1 1906 Summer 21 33 102 356 4.857143 10.787879
2 1908 Summer 34 39 368 463 10.823529 11.871795
3 1924 Winter 3 25 10 120 3.333333 4.800000
4 1956 Winter 3 53 8 142 2.666667 2.679245
5 1968 Summer 1 111 9 1048 9.000000 9.441441
6 1972 Winter 3 60 3 196 1.000000 3.266667
7 1976 Winter 3 67 7 204 2.333333 3.044776
8 1984 Winter 3 77 1 221 0.333333 2.870130
9 1988 Winter 4 94 6 257 1.500000 2.734043
Metric 2: Total number of gold medals¶
In [213]:
# medal count for home and away teams per Olympics game 
goldcnt_awayhome_df = pysqldf("""  WITH teams_count AS(SELECT *
                                                        FROM (SELECT Games, COUNT(DISTINCT Team) AS home_team_cnt 
                                                              FROM olympics_games_distinct
                                                              WHERE Region=Country
                                                              GROUP BY Games)
                                                        LEFT JOIN( SELECT Games, COUNT(DISTINCT Team) AS away_team_cnt 
                                                                   FROM olympics_games_distinct
                                                                   WHERE Region!=Country
                                                                   GROUP BY Games)
                                                        USING(Games) ) 
                                    SELECT * 
                                    FROM teams_count
                                    LEFT JOIN (
                                                  SELECT *
                                                  FROM( SELECT Games, COUNT(Medal) AS away_gold_count 
                                                        FROM olympics_games_distdf
                                                        WHERE (Medal = "Gold") AND (TeamType = 'away')
                                                        GROUP BY Games)
                                                  LEFT JOIN( SELECT Games, CAST(COUNT(Medal) AS INT) AS home_gold_count 
                                                            FROM olympics_games_distdf
                                                            WHERE  (Medal = "Gold") AND (TeamType = 'home')
                                                            GROUP BY Games)
                                                 USING(Games)
                                                )
                                    USING(Games)
                """)
goldcnt_awayhome_df
Out[213]:
Games home_team_cnt away_team_cnt away_gold_count home_gold_count
0 1896 Summer 5 13 52 10.0
1 1900 Summer 122 77 148 50.0
2 1904 Summer 59 27 45 128.0
3 1906 Summer 21 33 133 24.0
4 1908 Summer 34 39 147 147.0
5 1912 Summer 17 84 223 103.0
6 1920 Summer 9 63 436 57.0
7 1924 Summer 5 85 240 37.0
8 1924 Winter 3 25 55 NaN
9 1928 Summer 3 64 225 20.0
10 1928 Winter 3 38 30 NaN
11 1932 Summer 4 55 148 81.0
12 1932 Winter 3 26 22 10.0
13 1936 Summer 5 100 219 93.0
14 1936 Winter 3 51 32 4.0
15 1948 Summer 5 106 282 7.0
16 1948 Winter 3 43 37 4.0
17 1952 Summer 5 128 298 8.0
18 1952 Winter 3 49 38 7.0
19 1956 Summer 6 118 269 33.0
20 1956 Winter 3 53 49 2.0
21 1960 Summer 5 181 273 36.0
22 1960 Winter 4 36 31 19.0
23 1964 Summer 5 163 315 32.0
24 1964 Winter 3 63 56 5.0
25 1968 Summer 1 111 356 3.0
26 1968 Winter 3 67 62 4.0
27 1972 Summer 8 131 330 74.0
28 1972 Winter 3 60 69 1.0
29 1976 Summer 1 91 438 NaN
30 1976 Winter 3 67 68 2.0
31 1980 Summer 1 79 270 187.0
32 1980 Winter 3 55 48 24.0
33 1984 Summer 1 139 311 186.0
34 1984 Winter 3 77 74 NaN
35 1988 Summer 3 174 492 28.0
36 1988 Winter 4 94 87 NaN
37 1992 Summer 1 211 511 48.0
38 1992 Winter 4 107 99 5.0
39 1994 Winter 1 100 102 8.0
40 1996 Summer 4 242 449 159.0
41 1998 Winter 3 103 137 8.0
42 2000 Summer 4 239 603 60.0
43 2002 Winter 3 111 151 11.0
44 2004 Summer 3 257 656 8.0
45 2006 Winter 3 110 165 11.0
46 2008 Summer 5 287 597 74.0
47 2010 Winter 3 113 107 67.0
48 2012 Summer 3 242 584 48.0
49 2014 Winter 4 115 169 33.0
50 2016 Summer 3 245 629 36.0
In [214]:
# check Nulls in results above 
display(pysqldf("""SELECT * FROM olympics_games_distdf WHERE Games = '1924 Winter' AND Medal = 'Gold' AND TeamType = 'home'"""))
ID Name Sex Age Height Weight Team NOC Region Games Year Season Sport Event City Country Medal Notes TeamType
In [215]:
# Replace NaNs to 0 
goldcnt_awayhome_df = pysqldf(""" SELECT Games,home_team_cnt,away_team_cnt,away_gold_count,
                                          (CASE WHEN home_gold_count IS NULL THEN 0 ELSE home_gold_count END) AS home_gold_count
                                   FROM goldcnt_awayhome_df
                                """)
goldcnt_awayhome_df
Out[215]:
Games home_team_cnt away_team_cnt away_gold_count home_gold_count
0 1896 Summer 5 13 52 10.0
1 1900 Summer 122 77 148 50.0
2 1904 Summer 59 27 45 128.0
3 1906 Summer 21 33 133 24.0
4 1908 Summer 34 39 147 147.0
5 1912 Summer 17 84 223 103.0
6 1920 Summer 9 63 436 57.0
7 1924 Summer 5 85 240 37.0
8 1924 Winter 3 25 55 0.0
9 1928 Summer 3 64 225 20.0
10 1928 Winter 3 38 30 0.0
11 1932 Summer 4 55 148 81.0
12 1932 Winter 3 26 22 10.0
13 1936 Summer 5 100 219 93.0
14 1936 Winter 3 51 32 4.0
15 1948 Summer 5 106 282 7.0
16 1948 Winter 3 43 37 4.0
17 1952 Summer 5 128 298 8.0
18 1952 Winter 3 49 38 7.0
19 1956 Summer 6 118 269 33.0
20 1956 Winter 3 53 49 2.0
21 1960 Summer 5 181 273 36.0
22 1960 Winter 4 36 31 19.0
23 1964 Summer 5 163 315 32.0
24 1964 Winter 3 63 56 5.0
25 1968 Summer 1 111 356 3.0
26 1968 Winter 3 67 62 4.0
27 1972 Summer 8 131 330 74.0
28 1972 Winter 3 60 69 1.0
29 1976 Summer 1 91 438 0.0
30 1976 Winter 3 67 68 2.0
31 1980 Summer 1 79 270 187.0
32 1980 Winter 3 55 48 24.0
33 1984 Summer 1 139 311 186.0
34 1984 Winter 3 77 74 0.0
35 1988 Summer 3 174 492 28.0
36 1988 Winter 4 94 87 0.0
37 1992 Summer 1 211 511 48.0
38 1992 Winter 4 107 99 5.0
39 1994 Winter 1 100 102 8.0
40 1996 Summer 4 242 449 159.0
41 1998 Winter 3 103 137 8.0
42 2000 Summer 4 239 603 60.0
43 2002 Winter 3 111 151 11.0
44 2004 Summer 3 257 656 8.0
45 2006 Winter 3 110 165 11.0
46 2008 Summer 5 287 597 74.0
47 2010 Winter 3 113 107 67.0
48 2012 Summer 3 242 584 48.0
49 2014 Winter 4 115 169 33.0
50 2016 Summer 3 245 629 36.0
In [216]:
goldcnt_awayhome_df = pysqldf(""" SELECT 
                                        Games, home_team_cnt,away_team_cnt,home_gold_count,away_gold_count,
                                        CAST(home_gold_count AS float) / CAST(home_team_cnt AS float) AS home_avggoldmedal, 
                                        CAST(away_gold_count AS float) / CAST(away_team_cnt AS float) AS away_avggoldmedal 
                                   FROM goldcnt_awayhome_df; """)
goldcnt_awayhome_df
Out[216]:
Games home_team_cnt away_team_cnt home_gold_count away_gold_count home_avggoldmedal away_avggoldmedal
0 1896 Summer 5 13 10.0 52 2.000000 4.000000
1 1900 Summer 122 77 50.0 148 0.409836 1.922078
2 1904 Summer 59 27 128.0 45 2.169492 1.666667
3 1906 Summer 21 33 24.0 133 1.142857 4.030303
4 1908 Summer 34 39 147.0 147 4.323529 3.769231
5 1912 Summer 17 84 103.0 223 6.058824 2.654762
6 1920 Summer 9 63 57.0 436 6.333333 6.920635
7 1924 Summer 5 85 37.0 240 7.400000 2.823529
8 1924 Winter 3 25 0.0 55 0.000000 2.200000
9 1928 Summer 3 64 20.0 225 6.666667 3.515625
10 1928 Winter 3 38 0.0 30 0.000000 0.789474
11 1932 Summer 4 55 81.0 148 20.250000 2.690909
12 1932 Winter 3 26 10.0 22 3.333333 0.846154
13 1936 Summer 5 100 93.0 219 18.600000 2.190000
14 1936 Winter 3 51 4.0 32 1.333333 0.627451
15 1948 Summer 5 106 7.0 282 1.400000 2.660377
16 1948 Winter 3 43 4.0 37 1.333333 0.860465
17 1952 Summer 5 128 8.0 298 1.600000 2.328125
18 1952 Winter 3 49 7.0 38 2.333333 0.775510
19 1956 Summer 6 118 33.0 269 5.500000 2.279661
20 1956 Winter 3 53 2.0 49 0.666667 0.924528
21 1960 Summer 5 181 36.0 273 7.200000 1.508287
22 1960 Winter 4 36 19.0 31 4.750000 0.861111
23 1964 Summer 5 163 32.0 315 6.400000 1.932515
24 1964 Winter 3 63 5.0 56 1.666667 0.888889
25 1968 Summer 1 111 3.0 356 3.000000 3.207207
26 1968 Winter 3 67 4.0 62 1.333333 0.925373
27 1972 Summer 8 131 74.0 330 9.250000 2.519084
28 1972 Winter 3 60 1.0 69 0.333333 1.150000
29 1976 Summer 1 91 0.0 438 0.000000 4.813187
30 1976 Winter 3 67 2.0 68 0.666667 1.014925
31 1980 Summer 1 79 187.0 270 187.000000 3.417722
32 1980 Winter 3 55 24.0 48 8.000000 0.872727
33 1984 Summer 1 139 186.0 311 186.000000 2.237410
34 1984 Winter 3 77 0.0 74 0.000000 0.961039
35 1988 Summer 3 174 28.0 492 9.333333 2.827586
36 1988 Winter 4 94 0.0 87 0.000000 0.925532
37 1992 Summer 1 211 48.0 511 48.000000 2.421801
38 1992 Winter 4 107 5.0 99 1.250000 0.925234
39 1994 Winter 1 100 8.0 102 8.000000 1.020000
40 1996 Summer 4 242 159.0 449 39.750000 1.855372
41 1998 Winter 3 103 8.0 137 2.666667 1.330097
42 2000 Summer 4 239 60.0 603 15.000000 2.523013
43 2002 Winter 3 111 11.0 151 3.666667 1.360360
44 2004 Summer 3 257 8.0 656 2.666667 2.552529
45 2006 Winter 3 110 11.0 165 3.666667 1.500000
46 2008 Summer 5 287 74.0 597 14.800000 2.080139
47 2010 Winter 3 113 67.0 107 22.333333 0.946903
48 2012 Summer 3 242 48.0 584 16.000000 2.413223
49 2014 Winter 4 115 33.0 169 8.250000 1.469565
50 2016 Summer 3 245 36.0 629 12.000000 2.567347
In [ ]:
 
In [80]:
#Count number of  home and away teams' gold medals in each game 
awayhome_gold_df = (pysqldf(""" 
                    SELECT Games,
                           AwayHome, 
                           COUNT(Medal) AS gold_medal_count
                    FROM (  SELECT DISTINCT ID, 
                                    Name, 
                                    Games,
                                    Event,
                                    Sport, 
                                    Team, 
                                    Region, 
                                    Country, 
                                    Medal,
                                    (CASE WHEN Region != Country THEN 'away' 
                                          WHEN Region = Country THEN 'home'
                                          END) AS AwayHome
                            FROM olympics_games_distinct
                            WHERE Medal = 'Gold')
                    GROUP BY Games, AwayHome
                """))
awayhome_gold_df
Out[80]:
Games AwayHome gold_medal_count
0 1896 Summer away 52
1 1896 Summer home 10
2 1900 Summer away 148
3 1900 Summer home 50
4 1904 Summer away 45
5 1904 Summer home 128
6 1906 Summer away 133
7 1906 Summer home 24
8 1908 Summer away 147
9 1908 Summer home 147
10 1912 Summer away 223
11 1912 Summer home 103
12 1920 Summer away 436
13 1920 Summer home 57
14 1924 Summer away 240
15 1924 Summer home 37
16 1924 Winter away 55
17 1928 Summer away 225
18 1928 Summer home 20
19 1928 Winter away 30
20 1932 Summer away 148
21 1932 Summer home 81
22 1932 Winter away 22
23 1932 Winter home 10
24 1936 Summer away 219
25 1936 Summer home 93
26 1936 Winter away 32
27 1936 Winter home 4
28 1948 Summer away 282
29 1948 Summer home 7
30 1948 Winter away 37
31 1948 Winter home 4
32 1952 Summer away 298
33 1952 Summer home 8
34 1952 Winter away 38
35 1952 Winter home 7
36 1956 Summer away 269
37 1956 Summer home 33
38 1956 Winter away 49
39 1956 Winter home 2
40 1960 Summer away 273
41 1960 Summer home 36
42 1960 Winter away 31
43 1960 Winter home 19
44 1964 Summer away 315
45 1964 Summer home 32
46 1964 Winter away 56
47 1964 Winter home 5
48 1968 Summer away 356
49 1968 Summer home 3
50 1968 Winter away 62
51 1968 Winter home 4
52 1972 Summer away 330
53 1972 Summer home 74
54 1972 Winter away 69
55 1972 Winter home 1
56 1976 Summer away 438
57 1976 Winter away 68
58 1976 Winter home 2
59 1980 Summer away 270
60 1980 Summer home 187
61 1980 Winter away 48
62 1980 Winter home 24
63 1984 Summer away 311
64 1984 Summer home 186
65 1984 Winter away 74
66 1988 Summer away 492
67 1988 Summer home 28
68 1988 Winter away 87
69 1992 Summer away 511
70 1992 Summer home 48
71 1992 Winter away 99
72 1992 Winter home 5
73 1994 Winter away 102
74 1994 Winter home 8
75 1996 Summer away 449
76 1996 Summer home 159
77 1998 Winter away 137
78 1998 Winter home 8
79 2000 Summer away 603
80 2000 Summer home 60
81 2002 Winter away 151
82 2002 Winter home 11
83 2004 Summer away 656
84 2004 Summer home 8
85 2006 Winter away 165
86 2006 Winter home 11
87 2008 Summer away 597
88 2008 Summer home 74
89 2010 Winter away 107
90 2010 Winter home 67
91 2012 Summer away 584
92 2012 Summer home 48
93 2014 Winter away 169
94 2014 Winter home 33
95 2016 Summer away 629
96 2016 Summer home 36
In [81]:
#Count number of  home and away teams' silver medals in each game 
display(pysqldf(""" 
                    SELECT Games,
                           AwayHome, 
                           COUNT(Medal) AS silver_medal_count
                    FROM (  SELECT DISTINCT ID, 
                                    Name, 
                                    Games,
                                    Event,
                                    Sport, 
                                    Team, 
                                    Region, 
                                    Country, 
                                    Medal,
                                    (CASE WHEN Region != Country THEN 'away' 
                                          WHEN Region = Country THEN 'home'
                                          END) AS AwayHome
                            FROM olympics_games_distinct
                            WHERE Medal = 'Silver')
                    GROUP BY Games, AwayHome
                """))
Games AwayHome silver_medal_count
0 1896 Summer away 25
1 1896 Summer home 18
2 1900 Summer away 127
3 1900 Summer home 93
4 1904 Summer away 22
5 1904 Summer home 141
6 1906 Summer away 108
7 1906 Summer home 48
8 1908 Summer away 150
9 1908 Summer home 131
10 1912 Summer away 253
11 1912 Summer home 62
12 1920 Summer away 383
13 1920 Summer home 65
14 1924 Summer away 230
15 1924 Summer home 51
16 1924 Winter away 38
17 1928 Summer away 210
18 1928 Summer home 29
19 1928 Winter away 28
20 1932 Summer away 167
21 1932 Summer home 47
22 1932 Winter away 11
23 1932 Winter home 21
24 1936 Summer away 240
25 1936 Summer home 70
26 1936 Winter away 34
27 1936 Winter home 3
28 1948 Summer away 242
29 1948 Summer home 42
30 1948 Winter away 43
31 1948 Winter home 5
32 1952 Summer away 288
33 1952 Summer home 3
34 1952 Winter away 38
35 1952 Winter home 6
36 1956 Summer away 280
37 1956 Summer home 13
38 1956 Winter away 43
39 1956 Winter home 6
40 1960 Summer away 276
41 1960 Summer home 18
42 1960 Winter away 44
43 1960 Winter home 4
44 1964 Summer away 334
45 1964 Summer home 5
46 1964 Winter away 58
47 1964 Winter home 9
48 1968 Summer away 337
49 1968 Summer home 3
50 1968 Winter away 67
51 1968 Winter home 3
52 1972 Summer away 309
53 1972 Summer home 83
54 1972 Winter away 62
55 1972 Winter home 1
56 1976 Summer away 425
57 1976 Summer home 9
58 1976 Winter away 69
59 1976 Winter home 2
60 1980 Summer away 329
61 1980 Summer home 129
62 1980 Winter away 69
63 1980 Winter home 4
64 1984 Summer away 361
65 1984 Summer home 116
66 1984 Winter away 73
67 1984 Winter home 1
68 1988 Summer away 476
69 1988 Summer home 37
70 1988 Winter away 86
71 1988 Winter home 2
72 1992 Summer away 530
73 1992 Summer home 19
74 1992 Winter away 102
75 1992 Winter home 6
76 1994 Winter away 92
77 1994 Winter home 17
78 1996 Summer away 557
79 1996 Summer home 48
80 1998 Winter away 144
81 1998 Winter home 1
82 2000 Summer away 592
83 2000 Summer home 69
84 2002 Winter away 99
85 2002 Winter home 58
86 2004 Summer away 642
87 2004 Summer home 18
88 2006 Winter away 175
89 2008 Summer away 614
90 2008 Summer home 53
91 2010 Winter away 160
92 2010 Winter home 15
93 2012 Summer away 600
94 2012 Summer home 30
95 2014 Winter away 175
96 2014 Winter home 22
97 2016 Summer away 647
98 2016 Summer home 8
In [82]:
#Count number of  home and away teams' bronze medals in each game 
display(pysqldf(""" 
                    SELECT Games,
                           AwayHome, 
                           COUNT(Medal) AS bronze_medal_count
                    FROM (  SELECT DISTINCT ID, 
                                    Name, 
                                    Games,
                                    Event,
                                    Sport, 
                                    Team, 
                                    Region, 
                                    Country, 
                                    Medal,
                                    (CASE WHEN Region != Country THEN 'away' 
                                          WHEN Region = Country THEN 'home'
                                          END) AS AwayHome
                            FROM olympics_games_distinct
                            WHERE Medal = 'Bronze')
                    GROUP BY Games,  AwayHome
                """))
Games AwayHome bronze_medal_count
0 1896 Summer away 18
1 1896 Summer home 20
2 1900 Summer away 93
3 1900 Summer home 82
4 1904 Summer away 25
5 1904 Summer home 125
6 1906 Summer away 115
7 1906 Summer home 30
8 1908 Summer away 166
9 1908 Summer home 90
10 1912 Summer away 275
11 1912 Summer home 25
12 1920 Summer away 301
13 1920 Summer home 66
14 1924 Summer away 252
15 1924 Summer home 22
16 1924 Winter away 27
17 1924 Winter home 10
18 1928 Summer away 242
19 1928 Summer home 8
20 1928 Winter away 19
21 1928 Winter home 12
22 1932 Summer away 143
23 1932 Summer home 61
24 1932 Winter away 25
25 1932 Winter home 3
26 1936 Summer away 234
27 1936 Summer home 61
28 1936 Winter away 35
29 1948 Summer away 267
30 1948 Summer home 12
31 1948 Winter away 27
32 1948 Winter home 19
33 1952 Summer away 271
34 1952 Summer home 29
35 1952 Winter away 41
36 1952 Winter home 6
37 1956 Summer away 272
38 1956 Summer home 26
39 1956 Winter away 50
40 1960 Summer away 274
41 1960 Summer home 34
42 1960 Winter away 45
43 1960 Winter home 4
44 1964 Summer away 318
45 1964 Summer home 25
46 1964 Winter away 55
47 1964 Winter home 3
48 1968 Summer away 355
49 1968 Summer home 3
50 1968 Winter away 61
51 1968 Winter home 2
52 1972 Summer away 323
53 1972 Summer home 96
54 1972 Winter away 65
55 1972 Winter home 1
56 1976 Summer away 434
57 1976 Summer home 14
58 1976 Winter away 67
59 1976 Winter home 3
60 1980 Summer away 343
61 1980 Summer home 126
62 1980 Winter away 71
63 1980 Winter home 2
64 1984 Summer away 452
65 1984 Summer home 50
66 1984 Winter away 74
67 1988 Summer away 537
68 1988 Summer home 12
69 1988 Winter away 84
70 1988 Winter home 4
71 1992 Summer away 602
72 1992 Summer home 2
73 1992 Winter away 105
74 1992 Winter home 1
75 1994 Winter away 107
76 1994 Winter home 5
77 1996 Summer away 577
78 1996 Summer home 52
79 1998 Winter away 146
80 1998 Winter home 4
81 2000 Summer away 626
82 2000 Summer home 54
83 2002 Winter away 144
84 2002 Winter home 15
85 2004 Summer away 672
86 2004 Summer home 5
87 2006 Winter away 161
88 2006 Winter home 14
89 2008 Summer away 653
90 2008 Summer home 57
91 2010 Winter away 163
92 2010 Winter home 8
93 2012 Summer away 631
94 2012 Summer home 48
95 2014 Winter away 185
96 2014 Winter home 13
97 2016 Summer away 697
98 2016 Summer home 6
In [83]:
#Count number of  home and away teams' medals in each game 
goldsilverbronze_count_df = pysqldf(""" 
                    SELECT Games, 
                           AwayHome, 
                           Medal, 
                           COUNT(Medal) AS medal_count
                    FROM (  SELECT DISTINCT ID, 
                                    Name, 
                                    Games,
                                    Event,
                                    Sport, 
                                    Team, 
                                    Region, 
                                    Country, 
                                    Medal,
                                    (CASE WHEN Region != Country THEN 'away' 
                                          WHEN Region = Country THEN 'home'
                                          END) AS AwayHome
                            FROM olympics_games_distinct
                            WHERE Medal IS NOT NULL)
                    GROUP BY Games, AwayHome,Medal
                """)
goldsilverbronze_count_df
Out[83]:
Games AwayHome Medal medal_count
0 1896 Summer away Bronze 18
1 1896 Summer away Gold 52
2 1896 Summer away Silver 25
3 1896 Summer home Bronze 20
4 1896 Summer home Gold 10
... ... ... ... ...
290 2016 Summer away Gold 629
291 2016 Summer away Silver 647
292 2016 Summer home Bronze 6
293 2016 Summer home Gold 36
294 2016 Summer home Silver 8

295 rows × 4 columns

Hypothesis 2 tables¶

Metric 1: Total number of medals¶
In [84]:
# dataframe for athletes 
athletes_df = pysqldf("""SELECT ID, Name, Sex, Age, Height, Weight, Team, Games, Medal, COUNT(Medal) AS medal_count 
                            FROM olympics_games_distinct
                            GROUP BY ID, Name, Sex, Age, Height, Weight, Team, Games, Medal
                        """)
athletes_df
Out[84]:
ID Name Sex Age Height Weight Team Games Medal medal_count
0 1 A Dijiang M 24.0 180.0 80.0 China 1992 Summer None 0
1 2 A Lamusi M 23.0 170.0 60.0 China 2012 Summer None 0
2 3 Gunnar Nielsen Aaby M 24.0 NaN NaN Denmark 1920 Summer None 0
3 4 Edgar Lindenau Aabye M 34.0 NaN NaN Denmark/Sweden 1900 Summer Gold 1
4 5 Christine Jacoba Aaftink F 21.0 185.0 82.0 Netherlands 1988 Winter None 0
... ... ... ... ... ... ... ... ... ... ...
199822 135568 Olga Igorevna Zyuzkova F 33.0 171.0 69.0 Belarus 2016 Summer None 0
199823 135569 Andrzej ya M 29.0 179.0 89.0 Poland-1 1976 Winter None 0
199824 135570 Piotr ya M 27.0 176.0 59.0 Poland 2014 Winter None 0
199825 135571 Tomasz Ireneusz ya M 30.0 185.0 96.0 Poland 1998 Winter None 0
199826 135571 Tomasz Ireneusz ya M 34.0 185.0 96.0 Poland 2002 Winter None 0

199827 rows × 10 columns

In [85]:
# athlete who participated the most in different olympics games 
display(pysqldf("""SELECT *
                   FROM athletes_df
                   WHERE Name = 'Ian Millar'"""))
ID Name Sex Age Height Weight Team Games Medal medal_count
0 79855 Ian Millar M 25.0 185.0 76.0 Canada 1972 Summer None 0
1 79855 Ian Millar M 29.0 185.0 76.0 Canada 1976 Summer None 0
2 79855 Ian Millar M 37.0 185.0 76.0 Canada 1984 Summer None 0
3 79855 Ian Millar M 41.0 185.0 76.0 Canada 1988 Summer None 0
4 79855 Ian Millar M 45.0 185.0 76.0 Canada 1992 Summer None 0
5 79855 Ian Millar M 49.0 185.0 76.0 Canada 1996 Summer None 0
6 79855 Ian Millar M 53.0 185.0 76.0 Canada 2000 Summer None 0
7 79855 Ian Millar M 57.0 185.0 76.0 Canada 2004 Summer None 0
8 79855 Ian Millar M 61.0 185.0 76.0 Canada 2008 Summer None 0
9 79855 Ian Millar M 61.0 185.0 76.0 Canada 2008 Summer Silver 1
10 79855 Ian Millar M 65.0 185.0 76.0 Canada 2012 Summer None 0
In [86]:
#athlete with the highest number of medals 
athlete_medals_df = (pysqldf("""SELECT ID, Name, Sex, AVG(Age) AS avg_age, AVG(Height) AS avg_height, AVG(Weight) AS avg_weight,  SUM(medal_count) AS total_medal 
                   FROM athletes_df
                   GROUP BY  ID, Name
                   ORDER BY total_medal DESC"""))
athlete_medals_df
Out[86]:
ID Name Sex avg_age avg_height avg_weight total_medal
0 94406 Michael Fred Phelps, II M 24.333333 193.0 91.0 28
1 67046 Larysa Semenivna Latynina (Diriy-) F 24.600000 161.0 52.0 18
2 4198 Nikolay Yefimovich Andrianov M 23.000000 166.0 60.0 15
3 11951 Ole Einar Bjrndalen M 30.857143 178.0 65.0 13
4 74420 Edoardo Mangiarotti M 33.400000 NaN NaN 13
... ... ... ... ... ... ... ...
135566 135567 Aleksandr Viktorovich Zyuzin M 26.000000 183.0 72.0 0
135567 135568 Olga Igorevna Zyuzkova F 33.000000 171.0 69.0 0
135568 135569 Andrzej ya M 29.000000 179.0 89.0 0
135569 135570 Piotr ya M 27.000000 176.0 59.0 0
135570 135571 Tomasz Ireneusz ya M 32.000000 185.0 96.0 0

135571 rows × 7 columns

In [87]:
# count null values in avg age, height, and weight 
display(pysqldf("""SELECT
                        COUNT(*)-COUNT(Name) As Name, 
                        COUNT(*)-COUNT(Sex) As Sex, 
                        COUNT(*)-COUNT(avg_age) As avg_age, 
                        COUNT(*)-COUNT(avg_height) As avg_height, 
                        COUNT(*)-COUNT(avg_weight) As avg_weight, 
                        COUNT(*)-COUNT(total_medal) As total_medal
                    FROM athlete_medals_df;  """))
Name Sex avg_age avg_height avg_weight total_medal
0 0 0 6368 33916 35281 0
In [88]:
# Replace null values in each column with average 
athlete_medals_df = athlete_medals_df.fillna(athlete_medals_df[['avg_age','avg_height','avg_weight']].mean())
In [89]:
# recheck number of null values 
display(pysqldf("""SELECT
                        COUNT(*)-COUNT(Name) As Name, 
                        COUNT(*)-COUNT(Sex) As Sex, 
                        COUNT(*)-COUNT(avg_age) As avg_age, 
                        COUNT(*)-COUNT(avg_height) As avg_height, 
                        COUNT(*)-COUNT(avg_weight) As avg_weight, 
                        COUNT(*)-COUNT(total_medal) As total_medal
                    FROM athlete_medals_df;  """))
Name Sex avg_age avg_height avg_weight total_medal
0 0 0 0 0 0 0
Metric 2: Total number of gold medals¶
In [223]:
athletes_gold_df = pysqldf(""" SELECT ID, Name, Sex, AVG(Age) AS avg_age, AVG(Height) AS avg_height, AVG(Weight) AS avg_weight,  SUM(goldmedal_count) AS total_goldmedal 
                           FROM(   
                                SELECT ID, Name, Sex, Age, Height, Weight, Team, Games, Medal, COUNT(Medal) AS goldmedal_count 
                                FROM olympics_games_distinct
                                WHERE Medal= "Gold"
                                GROUP BY ID, Name, Sex, Age, Height, Weight, Team, Games, Medal) 
                           GROUP BY  ID, Name
                           ORDER BY total_goldmedal DESC
                        """)
athletes_gold_df
Out[223]:
ID Name Sex avg_age avg_height avg_weight total_goldmedal
0 94406 Michael Fred Phelps, II M 25.0 193.0 91.0 23
1 33557 Raymond Clarence "Ray" Ewry M 30.5 185.0 79.0 10
2 67046 Larysa Semenivna Latynina (Diriy-) F 25.0 161.0 52.0 9
3 69210 Frederick Carlton "Carl" Lewis M 29.0 188.0 80.0 9
4 87390 Paavo Johannes Nurmi M 26.0 174.0 65.0 9
... ... ... ... ... ... ... ...
10420 135503 Zurab Zviadauri M 23.0 182.0 90.0 1
10421 135520 Julia Zwehl F 28.0 167.0 60.0 1
10422 135523 Ronald Ferdinand "Ron" Zwerver M 29.0 200.0 93.0 1
10423 135545 Henk Jan Zwolle M 31.0 197.0 93.0 1
10424 135553 Galina Ivanovna Zybina (-Fyodorova) F 21.0 168.0 80.0 1

10425 rows × 7 columns

In [224]:
# Replace null values in each column with average 
athletes_gold_df = athletes_gold_df.fillna(athletes_gold_df[['avg_age','avg_height','avg_weight']].mean())

Hypothesis 3 tables¶

Metric 1: Total medals¶
In [90]:
display(pysqldf(""" SELECT Sex, Sport, COUNT(Medal) AS medal_count
                    FROM olympics_games_distinct
                    GROUP BY Sport, Sex;"""))
Sex Sport medal_count
0 M Aeronautics 1
1 F Alpine Skiing 213
2 M Alpine Skiing 215
3 F Alpinism 1
4 M Alpinism 24
... ... ... ...
111 M Water Polo 866
112 F Weightlifting 105
113 M Weightlifting 541
114 F Wrestling 68
115 M Wrestling 1228

116 rows × 3 columns

In [91]:
# count of male and female pariticpants in each sport 
display(pysqldf(""" SELECT Sport,
                            COUNT(CASE WHEN Sex = 'M' Then 1 END) AS male_cnt, 
                            COUNT(CASE WHEN Sex = 'F' Then 1 END) AS female_cnt
                    FROM olympics_games_distinct
                    GROUP BY Sport; 
                    """))
Sport male_cnt female_cnt
0 Aeronautics 1 0
1 Alpine Skiing 5431 3398
2 Alpinism 24 1
3 Archery 1319 1015
4 Art Competitions 2030 233
5 Athletics 26958 11666
6 Badminton 717 740
7 Baseball 894 0
8 Basketball 3280 1256
9 Basque Pelota 2 0
10 Beach Volleyball 288 276
11 Biathlon 3030 1863
12 Bobsleigh 2915 143
13 Boxing 5975 72
14 Canoeing 4791 1380
15 Cricket 24 0
16 Croquet 13 6
17 Cross Country Skiing 5748 3385
18 Curling 241 222
19 Cycling 9433 1394
20 Diving 1632 1210
21 Equestrianism 5097 1246
22 Fencing 8735 2000
23 Figure Skating 1126 1172
24 Football 5733 1012
25 Freestyle Skiing 504 433
26 Golf 177 70
27 Gymnastics 17578 9129
28 Handball 2264 1401
29 Hockey 3958 1459
30 Ice Hockey 4762 754
31 Jeu De Paume 11 0
32 Judo 2708 1093
33 Lacrosse 60 0
34 Luge 1102 377
35 Military Ski Patrol 24 0
36 Modern Pentathlon 1513 164
37 Motorboating 16 1
38 Nordic Combined 1344 0
39 Polo 95 0
40 Racquets 12 0
41 Rhythmic Gymnastics 0 658
42 Roque 4 0
43 Rowing 8402 2193
44 Rugby 162 0
45 Rugby Sevens 151 148
46 Sailing 5623 926
47 Shooting 9724 1724
48 Short Track Speed Skating 773 761
49 Skeleton 133 66
50 Ski Jumping 2371 30
51 Snowboarding 520 416
52 Softball 0 478
53 Speed Skating 3532 2081
54 Swimming 13345 9850
55 Synchronized Swimming 0 909
56 Table Tennis 1002 953
57 Taekwondo 307 299
58 Tennis 1684 1178
59 Trampolining 76 76
60 Triathlon 266 263
61 Tug-Of-War 170 0
62 Volleyball 1861 1543
63 Water Polo 3358 488
64 Weightlifting 3474 463
65 Wrestling 6850 304
In [92]:
#sports where only one gender participate in 
display(pysqldf(""" SELECT  * 
                    FROM(SELECT Sport,
                                COUNT(CASE WHEN Sex = 'M' Then 1 END) AS male_cnt, 
                                COUNT(CASE WHEN Sex = 'F' Then 1 END) AS female_cnt
                        FROM olympics_games_distinct
                        GROUP BY Sport)
                    WHERE male_cnt = 0 
                    """))
Sport male_cnt female_cnt
0 Rhythmic Gymnastics 0 658
1 Softball 0 478
2 Synchronized Swimming 0 909
In [93]:
#sports where only one gender participate in 
display(pysqldf("""SELECT *
                   FROM(
                        SELECT Sport, Sex, COUNT(DISTINCT Sex) as gender_count
                        FROM olympics_games_distinct
                        GROUP BY Sport)
                    WHERE gender_count = 1; 
                    """))
Sport Sex gender_count
0 Aeronautics M 1
1 Baseball M 1
2 Basque Pelota M 1
3 Cricket M 1
4 Jeu De Paume M 1
5 Lacrosse M 1
6 Military Ski Patrol M 1
7 Nordic Combined M 1
8 Polo M 1
9 Racquets M 1
10 Rhythmic Gymnastics F 1
11 Roque M 1
12 Rugby M 1
13 Softball F 1
14 Synchronized Swimming F 1
15 Tug-Of-War M 1
In [94]:
#sports where only female participate in 
display(pysqldf("""SELECT *
                   FROM(
                        SELECT Sport, Sex, COUNT(DISTINCT Sex) as gender_count
                        FROM olympics_games_distinct
                        GROUP BY Sport)
                    WHERE gender_count = 1 AND Sex = 'F'; 
                    """))
Sport Sex gender_count
0 Rhythmic Gymnastics F 1
1 Softball F 1
2 Synchronized Swimming F 1
In [95]:
#sports where only male participate in 
display(pysqldf("""SELECT *
                   FROM(
                        SELECT Sport, Sex, COUNT(DISTINCT Sex) as dist_gender_count
                        FROM olympics_games_distinct
                        GROUP BY Sport)
                    WHERE dist_gender_count = 1 AND Sex = 'M'; 
                    """))
Sport Sex dist_gender_count
0 Aeronautics M 1
1 Baseball M 1
2 Basque Pelota M 1
3 Cricket M 1
4 Jeu De Paume M 1
5 Lacrosse M 1
6 Military Ski Patrol M 1
7 Nordic Combined M 1
8 Polo M 1
9 Racquets M 1
10 Roque M 1
11 Rugby M 1
12 Tug-Of-War M 1
In [96]:
#sports where both gender participate 
mixedsex_sport_df = (pysqldf(""" SELECT *
                    FROM(
                        SELECT Sex, Sport, 
                               COUNT(Medal) AS medal_count
                        FROM olympics_games_distinct
                        GROUP BY Sport, Sex)
                    WHERE Sport NOT IN(SELECT Sport 
                                       FROM(
                                            SELECT Sport, Sex, COUNT(DISTINCT Sex) as gender_count
                                            FROM olympics_games_distinct
                                            GROUP BY Sport)
                                        WHERE gender_count = 1)
                        ;"""))
mixedsex_sport_df
Out[96]:
Sex Sport medal_count
0 F Alpine Skiing 213
1 M Alpine Skiing 215
2 F Alpinism 1
3 M Alpinism 24
4 F Archery 121
5 M Archery 232
6 F Art Competitions 11
7 M Art Competitions 145
8 F Athletics 1275
9 M Athletics 2694
10 F Badminton 84
11 M Badminton 84
12 F Basketball 393
13 M Basketball 687
14 F Beach Volleyball 36
15 M Beach Volleyball 36
16 F Biathlon 150
17 M Biathlon 258
18 F Bobsleigh 24
19 M Bobsleigh 374
20 F Boxing 24
21 M Boxing 920
22 F Canoeing 282
23 M Canoeing 883
24 F Croquet 0
25 M Croquet 8
26 F Cross Country Skiing 339
27 M Cross Country Skiing 437
28 F Curling 68
29 M Curling 84
30 F Cycling 176
31 M Cycling 1087
32 F Diving 201
33 M Diving 226
34 F Equestrianism 207
35 M Equestrianism 758
36 F Fencing 349
37 M Fencing 1394
38 F Figure Skating 191
39 M Figure Skating 195
40 F Football 302
41 M Football 1269
42 F Freestyle Skiing 51
43 M Freestyle Skiing 51
44 F Golf 6
45 M Golf 40
46 F Gymnastics 701
47 M Gymnastics 1555
48 F Handball 472
49 M Handball 588
50 F Hockey 478
51 M Hockey 1050
52 F Ice Hockey 300
53 M Ice Hockey 1230
54 F Judo 196
55 M Judo 351
56 F Luge 45
57 M Luge 135
58 F Modern Pentathlon 15
59 M Modern Pentathlon 171
60 F Motorboating 0
61 M Motorboating 7
62 F Rowing 720
63 M Rowing 2225
64 F Rugby Sevens 36
65 M Rugby Sevens 38
66 F Sailing 134
67 M Sailing 1087
68 F Shooting 146
69 M Shooting 1082
70 F Short Track Speed Skating 140
71 M Short Track Speed Skating 144
72 F Skeleton 12
73 M Skeleton 18
74 F Ski Jumping 3
75 M Ski Jumping 204
76 F Snowboarding 45
77 M Snowboarding 45
78 F Speed Skating 242
79 M Speed Skating 338
80 F Swimming 1374
81 M Swimming 1674
82 F Table Tennis 84
83 M Table Tennis 84
84 F Taekwondo 72
85 M Taekwondo 72
86 F Tennis 140
87 M Tennis 200
88 F Trampolining 15
89 M Trampolining 15
90 F Triathlon 15
91 M Triathlon 15
92 F Volleyball 474
93 M Volleyball 495
94 F Water Polo 191
95 M Water Polo 866
96 F Weightlifting 105
97 M Weightlifting 541
98 F Wrestling 68
99 M Wrestling 1228
In [97]:
display(pysqldf("""   SELECT *
                      FROM(
                          SELECT Sport, medal_count AS female_medal_cnt 
                          FROM mixedsex_sport_df
                          WHERE Sex = 'F')
                      LEFT JOIN ( SELECT Sport, medal_count AS male_medal_cnt 
                                  FROM mixedsex_sport_df
                                  WHERE Sex = 'M')
                      USING(Sport)"""))
Sport female_medal_cnt male_medal_cnt
0 Alpine Skiing 213 215
1 Alpinism 1 24
2 Archery 121 232
3 Art Competitions 11 145
4 Athletics 1275 2694
5 Badminton 84 84
6 Basketball 393 687
7 Beach Volleyball 36 36
8 Biathlon 150 258
9 Bobsleigh 24 374
10 Boxing 24 920
11 Canoeing 282 883
12 Croquet 0 8
13 Cross Country Skiing 339 437
14 Curling 68 84
15 Cycling 176 1087
16 Diving 201 226
17 Equestrianism 207 758
18 Fencing 349 1394
19 Figure Skating 191 195
20 Football 302 1269
21 Freestyle Skiing 51 51
22 Golf 6 40
23 Gymnastics 701 1555
24 Handball 472 588
25 Hockey 478 1050
26 Ice Hockey 300 1230
27 Judo 196 351
28 Luge 45 135
29 Modern Pentathlon 15 171
30 Motorboating 0 7
31 Rowing 720 2225
32 Rugby Sevens 36 38
33 Sailing 134 1087
34 Shooting 146 1082
35 Short Track Speed Skating 140 144
36 Skeleton 12 18
37 Ski Jumping 3 204
38 Snowboarding 45 45
39 Speed Skating 242 338
40 Swimming 1374 1674
41 Table Tennis 84 84
42 Taekwondo 72 72
43 Tennis 140 200
44 Trampolining 15 15
45 Triathlon 15 15
46 Volleyball 474 495
47 Water Polo 191 866
48 Weightlifting 105 541
49 Wrestling 68 1228
In [98]:
#Count number of  male and female athletes in each sport 
sportgender_count_df = pysqldf(""" 
                    SELECT Sport, 
                           COUNT(CASE WHEN Sex ='F' THEN 1 END) AS female_ath_count,  
                           COUNT(CASE WHEN Sex ='M' THEN 1 END) AS male_ath_count
                    FROM ( SELECT DISTINCT ID, Sex, Sport
                           FROM olympics_games_distinct)
                    GROUP BY Sport
                """)
sportgender_count_df
Out[98]:
Sport female_ath_count male_ath_count
0 Aeronautics 0 1
1 Alpine Skiing 996 1739
2 Alpinism 1 24
3 Archery 500 613
4 Art Competitions 204 1610
5 Athletics 6529 15542
6 Badminton 412 399
7 Baseball 0 761
8 Basketball 932 2481
9 Basque Pelota 0 2
10 Beach Volleyball 189 194
11 Biathlon 371 764
12 Bobsleigh 109 1585
13 Boxing 65 5197
14 Canoeing 702 2504
15 Cricket 0 24
16 Croquet 3 7
17 Cross Country Skiing 717 1683
18 Curling 160 186
19 Cycling 714 5105
20 Diving 635 831
21 Equestrianism 459 1886
22 Fencing 880 3243
23 Figure Skating 824 748
24 Football 734 5427
25 Freestyle Skiing 267 359
26 Golf 70 148
27 Gymnastics 1499 2635
28 Handball 1027 1675
29 Hockey 996 2829
30 Ice Hockey 498 3386
31 Jeu De Paume 0 11
32 Judo 757 1967
33 Lacrosse 0 60
34 Luge 228 544
35 Military Ski Patrol 0 24
36 Modern Pentathlon 114 750
37 Motorboating 1 13
38 Nordic Combined 0 605
39 Polo 0 87
40 Racquets 0 7
41 Rhythmic Gymnastics 567 0
42 Roque 0 4
43 Rowing 1483 6204
44 Rugby 0 155
45 Rugby Sevens 148 151
46 Sailing 629 3851
47 Shooting 737 4145
48 Short Track Speed Skating 209 235
49 Skeleton 45 101
50 Ski Jumping 30 844
51 Snowboarding 239 328
52 Softball 367 0
53 Speed Skating 528 1054
54 Swimming 3621 5144
55 Synchronized Swimming 550 0
56 Table Tennis 377 372
57 Taekwondo 229 241
58 Tennis 486 760
59 Trampolining 44 49
60 Triathlon 175 180
61 Tug-Of-War 0 160
62 Volleyball 1129 1374
63 Water Polo 337 2262
64 Weightlifting 356 2526
65 Wrestling 222 4766
In [99]:
#only 1 aeronautic record? 
display(pysqldf(""" SELECT *
                            FROM olympics_games_distinct
                            where Sport = 'Aeronautics'  """))
ID Name Sex Age Height Weight Team NOC Region Games Year Season Sport Event City Country Medal Notes
0 107506 Hermann Schreiber M 26.0 None None Switzerland SUI Switzerland 1936 Summer 1936 Summer Aeronautics Aeronautics Mixed Aeronautics Berlin Germany Gold None
In [100]:
# verify result above 
display(pysqldf(""" SELECT *
                            FROM olympics_games
                            where Sport = 'Basque Pelota' """))
ID Name Sex Age Height Weight Team NOC Region Games Year Season Sport Event City Country Medal Notes
0 25866 Jos de Amzola y Aspiza M 26 None None Spain ESP Spain 1900 Summer 1900 Summer Basque Pelota Basque Pelota Men's Two-Man Teams With Cesta Paris France Gold None
1 126675 Francisco Villota y Baquiola M 26 None None Spain ESP Spain 1900 Summer 1900 Summer Basque Pelota Basque Pelota Men's Two-Man Teams With Cesta Paris France Gold None
In [101]:
# exlcude sports where only one gender participate in 
mixedsport_athcount_df = pysqldf(""" SELECT *
                                     FROM sportgender_count_df
                                     WHERE female_ath_count > 0 AND male_ath_count > 0""")
mixedsport_athcount_df
Out[101]:
Sport female_ath_count male_ath_count
0 Alpine Skiing 996 1739
1 Alpinism 1 24
2 Archery 500 613
3 Art Competitions 204 1610
4 Athletics 6529 15542
5 Badminton 412 399
6 Basketball 932 2481
7 Beach Volleyball 189 194
8 Biathlon 371 764
9 Bobsleigh 109 1585
10 Boxing 65 5197
11 Canoeing 702 2504
12 Croquet 3 7
13 Cross Country Skiing 717 1683
14 Curling 160 186
15 Cycling 714 5105
16 Diving 635 831
17 Equestrianism 459 1886
18 Fencing 880 3243
19 Figure Skating 824 748
20 Football 734 5427
21 Freestyle Skiing 267 359
22 Golf 70 148
23 Gymnastics 1499 2635
24 Handball 1027 1675
25 Hockey 996 2829
26 Ice Hockey 498 3386
27 Judo 757 1967
28 Luge 228 544
29 Modern Pentathlon 114 750
30 Motorboating 1 13
31 Rowing 1483 6204
32 Rugby Sevens 148 151
33 Sailing 629 3851
34 Shooting 737 4145
35 Short Track Speed Skating 209 235
36 Skeleton 45 101
37 Ski Jumping 30 844
38 Snowboarding 239 328
39 Speed Skating 528 1054
40 Swimming 3621 5144
41 Table Tennis 377 372
42 Taekwondo 229 241
43 Tennis 486 760
44 Trampolining 44 49
45 Triathlon 175 180
46 Volleyball 1129 1374
47 Water Polo 337 2262
48 Weightlifting 356 2526
49 Wrestling 222 4766
In [102]:
# medal and athlete count  for each sport where both genders participate in 
sportgender_medal_df = pysqldf("""    SELECT * 
                                      FROM mixedsport_athcount_df 
                                      LEFT JOIN ( SELECT *
                                                  FROM(
                                                      SELECT Sport, medal_count AS female_medal_cnt 
                                                      FROM mixedsex_sport_df
                                                      WHERE Sex = 'F')
                                                  LEFT JOIN ( SELECT Sport, medal_count AS male_medal_cnt 
                                                              FROM mixedsex_sport_df
                                                              WHERE Sex = 'M')
                                                  USING(Sport))
                                      USING(Sport) """)
sportgender_medal_df
Out[102]:
Sport female_ath_count male_ath_count female_medal_cnt male_medal_cnt
0 Alpine Skiing 996 1739 213 215
1 Alpinism 1 24 1 24
2 Archery 500 613 121 232
3 Art Competitions 204 1610 11 145
4 Athletics 6529 15542 1275 2694
5 Badminton 412 399 84 84
6 Basketball 932 2481 393 687
7 Beach Volleyball 189 194 36 36
8 Biathlon 371 764 150 258
9 Bobsleigh 109 1585 24 374
10 Boxing 65 5197 24 920
11 Canoeing 702 2504 282 883
12 Croquet 3 7 0 8
13 Cross Country Skiing 717 1683 339 437
14 Curling 160 186 68 84
15 Cycling 714 5105 176 1087
16 Diving 635 831 201 226
17 Equestrianism 459 1886 207 758
18 Fencing 880 3243 349 1394
19 Figure Skating 824 748 191 195
20 Football 734 5427 302 1269
21 Freestyle Skiing 267 359 51 51
22 Golf 70 148 6 40
23 Gymnastics 1499 2635 701 1555
24 Handball 1027 1675 472 588
25 Hockey 996 2829 478 1050
26 Ice Hockey 498 3386 300 1230
27 Judo 757 1967 196 351
28 Luge 228 544 45 135
29 Modern Pentathlon 114 750 15 171
30 Motorboating 1 13 0 7
31 Rowing 1483 6204 720 2225
32 Rugby Sevens 148 151 36 38
33 Sailing 629 3851 134 1087
34 Shooting 737 4145 146 1082
35 Short Track Speed Skating 209 235 140 144
36 Skeleton 45 101 12 18
37 Ski Jumping 30 844 3 204
38 Snowboarding 239 328 45 45
39 Speed Skating 528 1054 242 338
40 Swimming 3621 5144 1374 1674
41 Table Tennis 377 372 84 84
42 Taekwondo 229 241 72 72
43 Tennis 486 760 140 200
44 Trampolining 44 49 15 15
45 Triathlon 175 180 15 15
46 Volleyball 1129 1374 474 495
47 Water Polo 337 2262 191 866
48 Weightlifting 356 2526 105 541
49 Wrestling 222 4766 68 1228
In [103]:
# medal count for male and female athletes for each sport where both genders participate in 
#sportgendermedal_df = pysqldf("""  WITH obs_count AS(
#                                                         SELECT 
#                                                            Sport, 
#                                                            COUNT(CASE WHEN Sex = 'F' THEN 1 END) AS femaleobs_count, 
#                                                            COUNT(CASE WHEN Sex = 'M' THEN 1 END) AS maleobs_count 
#                                                        FROM olympics_games_distdf 
#                                                        GROUP BY  Sport ) 
#                                    SELECT * 
#                                    FROM obs_count
#                                    LEFT JOIN gender_sportmedal_df
#                                    USING(Sport)
 #                                   WHERE femaleobs_count > 0 AND maleobs_count > 0 
#                                """)
#sportgendermedal_df
In [104]:
sportgendermedal_df = pysqldf(""" SELECT 
                                        Sport,female_ath_count,male_ath_count,female_medal_cnt,male_medal_cnt,
                                        CAST(female_medal_cnt AS float) / CAST(female_ath_count AS float) AS female_avgmedal, 
                                        CAST(male_medal_cnt AS float) / CAST(male_ath_count AS float) AS male_avgmedal 
                                   FROM sportgender_medal_df; """)
sportgendermedal_df
Out[104]:
Sport female_ath_count male_ath_count female_medal_cnt male_medal_cnt female_avgmedal male_avgmedal
0 Alpine Skiing 996 1739 213 215 0.213855 0.123634
1 Alpinism 1 24 1 24 1.000000 1.000000
2 Archery 500 613 121 232 0.242000 0.378467
3 Art Competitions 204 1610 11 145 0.053922 0.090062
4 Athletics 6529 15542 1275 2694 0.195283 0.173337
5 Badminton 412 399 84 84 0.203883 0.210526
6 Basketball 932 2481 393 687 0.421674 0.276904
7 Beach Volleyball 189 194 36 36 0.190476 0.185567
8 Biathlon 371 764 150 258 0.404313 0.337696
9 Bobsleigh 109 1585 24 374 0.220183 0.235962
10 Boxing 65 5197 24 920 0.369231 0.177025
11 Canoeing 702 2504 282 883 0.401709 0.352636
12 Croquet 3 7 0 8 0.000000 1.142857
13 Cross Country Skiing 717 1683 339 437 0.472803 0.259655
14 Curling 160 186 68 84 0.425000 0.451613
15 Cycling 714 5105 176 1087 0.246499 0.212929
16 Diving 635 831 201 226 0.316535 0.271961
17 Equestrianism 459 1886 207 758 0.450980 0.401909
18 Fencing 880 3243 349 1394 0.396591 0.429849
19 Figure Skating 824 748 191 195 0.231796 0.260695
20 Football 734 5427 302 1269 0.411444 0.233831
21 Freestyle Skiing 267 359 51 51 0.191011 0.142061
22 Golf 70 148 6 40 0.085714 0.270270
23 Gymnastics 1499 2635 701 1555 0.467645 0.590133
24 Handball 1027 1675 472 588 0.459591 0.351045
25 Hockey 996 2829 478 1050 0.479920 0.371156
26 Ice Hockey 498 3386 300 1230 0.602410 0.363260
27 Judo 757 1967 196 351 0.258917 0.178444
28 Luge 228 544 45 135 0.197368 0.248162
29 Modern Pentathlon 114 750 15 171 0.131579 0.228000
30 Motorboating 1 13 0 7 0.000000 0.538462
31 Rowing 1483 6204 720 2225 0.485502 0.358640
32 Rugby Sevens 148 151 36 38 0.243243 0.251656
33 Sailing 629 3851 134 1087 0.213037 0.282264
34 Shooting 737 4145 146 1082 0.198100 0.261037
35 Short Track Speed Skating 209 235 140 144 0.669856 0.612766
36 Skeleton 45 101 12 18 0.266667 0.178218
37 Ski Jumping 30 844 3 204 0.100000 0.241706
38 Snowboarding 239 328 45 45 0.188285 0.137195
39 Speed Skating 528 1054 242 338 0.458333 0.320683
40 Swimming 3621 5144 1374 1674 0.379453 0.325428
41 Table Tennis 377 372 84 84 0.222812 0.225806
42 Taekwondo 229 241 72 72 0.314410 0.298755
43 Tennis 486 760 140 200 0.288066 0.263158
44 Trampolining 44 49 15 15 0.340909 0.306122
45 Triathlon 175 180 15 15 0.085714 0.083333
46 Volleyball 1129 1374 474 495 0.419841 0.360262
47 Water Polo 337 2262 191 866 0.566766 0.382847
48 Weightlifting 356 2526 105 541 0.294944 0.214173
49 Wrestling 222 4766 68 1228 0.306306 0.257658
Metric 2: Total gold medals¶
In [227]:
gendergold_sport_df = (pysqldf("""  SELECT *
                                    FROM(
                                        SELECT Sex, Sport, 
                                               COUNT(Medal) AS goldmedal_count
                                        FROM olympics_games_distinct
                                        WHERE Medal = 'Gold'
                                        GROUP BY Sport, Sex)
                                    WHERE Sport NOT IN(SELECT Sport 
                                                       FROM(
                                                            SELECT Sport, Sex, COUNT(DISTINCT Sex) as gender_count
                                                            FROM olympics_games_distinct
                                                            GROUP BY Sport)
                                                        WHERE gender_count = 1) ;
                                """))
gendergold_sport_df
Out[227]:
Sex Sport goldmedal_count
0 F Alpine Skiing 72
1 M Alpine Skiing 71
2 F Alpinism 1
3 M Alpinism 24
4 F Archery 43
5 M Archery 92
6 F Art Competitions 1
7 M Art Competitions 48
8 F Athletics 428
9 M Athletics 911
10 F Badminton 27
11 M Badminton 27
12 F Basketball 131
13 M Basketball 234
14 F Beach Volleyball 12
15 M Beach Volleyball 12
16 F Biathlon 50
17 M Biathlon 86
18 F Bobsleigh 8
19 M Bobsleigh 125
20 F Boxing 6
21 M Boxing 246
22 F Canoeing 94
23 M Canoeing 294
24 M Croquet 4
25 F Cross Country Skiing 113
26 M Cross Country Skiing 143
27 F Curling 22
28 M Curling 28
29 F Cycling 58
30 M Cycling 366
31 F Diving 67
32 M Diving 75
33 F Equestrianism 55
34 M Equestrianism 269
35 F Fencing 119
36 M Fencing 475
37 F Figure Skating 65
38 M Figure Skating 66
39 F Football 101
40 M Football 414
41 F Freestyle Skiing 17
42 M Freestyle Skiing 17
43 F Golf 2
44 M Golf 13
45 F Gymnastics 234
46 M Gymnastics 557
47 F Handball 155
48 M Handball 194
49 F Hockey 158
50 M Hockey 360
51 F Ice Hockey 101
52 M Ice Hockey 407
53 F Judo 49
54 M Judo 88
55 F Luge 15
56 M Luge 47
57 F Modern Pentathlon 5
58 M Modern Pentathlon 57
59 M Motorboating 7
60 F Rowing 238
61 M Rowing 740
62 F Rugby Sevens 12
63 M Rugby Sevens 13
64 F Sailing 47
65 M Sailing 400
66 F Shooting 49
67 M Shooting 361
68 F Short Track Speed Skating 47
69 M Short Track Speed Skating 48
70 F Skeleton 4
71 M Skeleton 6
72 F Ski Jumping 1
73 M Ski Jumping 68
74 F Snowboarding 15
75 M Snowboarding 15
76 F Speed Skating 81
77 M Speed Skating 112
78 F Swimming 493
79 M Swimming 606
80 F Table Tennis 27
81 M Table Tennis 27
82 F Taekwondo 20
83 M Taekwondo 20
84 F Tennis 44
85 M Tennis 62
86 F Trampolining 5
87 M Trampolining 5
88 F Triathlon 5
89 M Triathlon 5
90 F Volleyball 156
91 M Volleyball 166
92 F Water Polo 63
93 M Water Polo 287
94 F Weightlifting 35
95 M Weightlifting 182
96 F Wrestling 18
97 M Wrestling 395
In [229]:
# medal and athlete count  for each sport where both genders participate in 
sportgender_goldmedal_df = pysqldf("""    SELECT * 
                                      FROM mixedsport_athcount_df 
                                      LEFT JOIN ( SELECT *
                                                  FROM(
                                                      SELECT Sport, goldmedal_count AS male_goldmedal_cnt 
                                                      FROM gendergold_sport_df
                                                      WHERE Sex = 'M')
                                                  LEFT JOIN ( SELECT Sport, goldmedal_count AS female_goldmedal_cnt 
                                                              FROM gendergold_sport_df
                                                              WHERE Sex = 'F')
                                                  USING(Sport))
                                      USING(Sport) """)
sportgender_goldmedal_df
Out[229]:
Sport female_ath_count male_ath_count male_goldmedal_cnt female_goldmedal_cnt
0 Alpine Skiing 996 1739 71 72.0
1 Alpinism 1 24 24 1.0
2 Archery 500 613 92 43.0
3 Art Competitions 204 1610 48 1.0
4 Athletics 6529 15542 911 428.0
5 Badminton 412 399 27 27.0
6 Basketball 932 2481 234 131.0
7 Beach Volleyball 189 194 12 12.0
8 Biathlon 371 764 86 50.0
9 Bobsleigh 109 1585 125 8.0
10 Boxing 65 5197 246 6.0
11 Canoeing 702 2504 294 94.0
12 Croquet 3 7 4 NaN
13 Cross Country Skiing 717 1683 143 113.0
14 Curling 160 186 28 22.0
15 Cycling 714 5105 366 58.0
16 Diving 635 831 75 67.0
17 Equestrianism 459 1886 269 55.0
18 Fencing 880 3243 475 119.0
19 Figure Skating 824 748 66 65.0
20 Football 734 5427 414 101.0
21 Freestyle Skiing 267 359 17 17.0
22 Golf 70 148 13 2.0
23 Gymnastics 1499 2635 557 234.0
24 Handball 1027 1675 194 155.0
25 Hockey 996 2829 360 158.0
26 Ice Hockey 498 3386 407 101.0
27 Judo 757 1967 88 49.0
28 Luge 228 544 47 15.0
29 Modern Pentathlon 114 750 57 5.0
30 Motorboating 1 13 7 NaN
31 Rowing 1483 6204 740 238.0
32 Rugby Sevens 148 151 13 12.0
33 Sailing 629 3851 400 47.0
34 Shooting 737 4145 361 49.0
35 Short Track Speed Skating 209 235 48 47.0
36 Skeleton 45 101 6 4.0
37 Ski Jumping 30 844 68 1.0
38 Snowboarding 239 328 15 15.0
39 Speed Skating 528 1054 112 81.0
40 Swimming 3621 5144 606 493.0
41 Table Tennis 377 372 27 27.0
42 Taekwondo 229 241 20 20.0
43 Tennis 486 760 62 44.0
44 Trampolining 44 49 5 5.0
45 Triathlon 175 180 5 5.0
46 Volleyball 1129 1374 166 156.0
47 Water Polo 337 2262 287 63.0
48 Weightlifting 356 2526 182 35.0
49 Wrestling 222 4766 395 18.0
In [231]:
# Replace null values with 0
sportgender_goldmedal_df = pysqldf(""" SELECT Sport,female_ath_count,male_ath_count,male_goldmedal_cnt,
                                          (CASE WHEN female_goldmedal_cnt IS NULL THEN 0 ELSE female_goldmedal_cnt END) AS female_goldmedal_cnt
                                   FROM sportgender_goldmedal_df
                                """)
sportgender_goldmedal_df
Out[231]:
Sport female_ath_count male_ath_count male_goldmedal_cnt female_goldmedal_cnt
0 Alpine Skiing 996 1739 71 72.0
1 Alpinism 1 24 24 1.0
2 Archery 500 613 92 43.0
3 Art Competitions 204 1610 48 1.0
4 Athletics 6529 15542 911 428.0
5 Badminton 412 399 27 27.0
6 Basketball 932 2481 234 131.0
7 Beach Volleyball 189 194 12 12.0
8 Biathlon 371 764 86 50.0
9 Bobsleigh 109 1585 125 8.0
10 Boxing 65 5197 246 6.0
11 Canoeing 702 2504 294 94.0
12 Croquet 3 7 4 0.0
13 Cross Country Skiing 717 1683 143 113.0
14 Curling 160 186 28 22.0
15 Cycling 714 5105 366 58.0
16 Diving 635 831 75 67.0
17 Equestrianism 459 1886 269 55.0
18 Fencing 880 3243 475 119.0
19 Figure Skating 824 748 66 65.0
20 Football 734 5427 414 101.0
21 Freestyle Skiing 267 359 17 17.0
22 Golf 70 148 13 2.0
23 Gymnastics 1499 2635 557 234.0
24 Handball 1027 1675 194 155.0
25 Hockey 996 2829 360 158.0
26 Ice Hockey 498 3386 407 101.0
27 Judo 757 1967 88 49.0
28 Luge 228 544 47 15.0
29 Modern Pentathlon 114 750 57 5.0
30 Motorboating 1 13 7 0.0
31 Rowing 1483 6204 740 238.0
32 Rugby Sevens 148 151 13 12.0
33 Sailing 629 3851 400 47.0
34 Shooting 737 4145 361 49.0
35 Short Track Speed Skating 209 235 48 47.0
36 Skeleton 45 101 6 4.0
37 Ski Jumping 30 844 68 1.0
38 Snowboarding 239 328 15 15.0
39 Speed Skating 528 1054 112 81.0
40 Swimming 3621 5144 606 493.0
41 Table Tennis 377 372 27 27.0
42 Taekwondo 229 241 20 20.0
43 Tennis 486 760 62 44.0
44 Trampolining 44 49 5 5.0
45 Triathlon 175 180 5 5.0
46 Volleyball 1129 1374 166 156.0
47 Water Polo 337 2262 287 63.0
48 Weightlifting 356 2526 182 35.0
49 Wrestling 222 4766 395 18.0
In [233]:
sportgender_gold_df = pysqldf(""" SELECT 
                                        Sport,female_ath_count,male_ath_count,male_goldmedal_cnt,female_goldmedal_cnt,
                                        CAST(female_goldmedal_cnt AS float) / CAST(female_ath_count AS float) AS female_avggoldmedal, 
                                        CAST(male_goldmedal_cnt AS float) / CAST(male_ath_count AS float) AS male_avggoldmedal 
                                   FROM sportgender_goldmedal_df; """)
sportgender_gold_df
Out[233]:
Sport female_ath_count male_ath_count male_goldmedal_cnt female_goldmedal_cnt female_avggoldmedal male_avggoldmedal
0 Alpine Skiing 996 1739 71 72.0 0.072289 0.040828
1 Alpinism 1 24 24 1.0 1.000000 1.000000
2 Archery 500 613 92 43.0 0.086000 0.150082
3 Art Competitions 204 1610 48 1.0 0.004902 0.029814
4 Athletics 6529 15542 911 428.0 0.065554 0.058615
5 Badminton 412 399 27 27.0 0.065534 0.067669
6 Basketball 932 2481 234 131.0 0.140558 0.094317
7 Beach Volleyball 189 194 12 12.0 0.063492 0.061856
8 Biathlon 371 764 86 50.0 0.134771 0.112565
9 Bobsleigh 109 1585 125 8.0 0.073394 0.078864
10 Boxing 65 5197 246 6.0 0.092308 0.047335
11 Canoeing 702 2504 294 94.0 0.133903 0.117412
12 Croquet 3 7 4 0.0 0.000000 0.571429
13 Cross Country Skiing 717 1683 143 113.0 0.157601 0.084967
14 Curling 160 186 28 22.0 0.137500 0.150538
15 Cycling 714 5105 366 58.0 0.081232 0.071694
16 Diving 635 831 75 67.0 0.105512 0.090253
17 Equestrianism 459 1886 269 55.0 0.119826 0.142630
18 Fencing 880 3243 475 119.0 0.135227 0.146469
19 Figure Skating 824 748 66 65.0 0.078883 0.088235
20 Football 734 5427 414 101.0 0.137602 0.076285
21 Freestyle Skiing 267 359 17 17.0 0.063670 0.047354
22 Golf 70 148 13 2.0 0.028571 0.087838
23 Gymnastics 1499 2635 557 234.0 0.156104 0.211385
24 Handball 1027 1675 194 155.0 0.150925 0.115821
25 Hockey 996 2829 360 158.0 0.158635 0.127253
26 Ice Hockey 498 3386 407 101.0 0.202811 0.120201
27 Judo 757 1967 88 49.0 0.064729 0.044738
28 Luge 228 544 47 15.0 0.065789 0.086397
29 Modern Pentathlon 114 750 57 5.0 0.043860 0.076000
30 Motorboating 1 13 7 0.0 0.000000 0.538462
31 Rowing 1483 6204 740 238.0 0.160486 0.119278
32 Rugby Sevens 148 151 13 12.0 0.081081 0.086093
33 Sailing 629 3851 400 47.0 0.074722 0.103869
34 Shooting 737 4145 361 49.0 0.066486 0.087093
35 Short Track Speed Skating 209 235 48 47.0 0.224880 0.204255
36 Skeleton 45 101 6 4.0 0.088889 0.059406
37 Ski Jumping 30 844 68 1.0 0.033333 0.080569
38 Snowboarding 239 328 15 15.0 0.062762 0.045732
39 Speed Skating 528 1054 112 81.0 0.153409 0.106262
40 Swimming 3621 5144 606 493.0 0.136150 0.117807
41 Table Tennis 377 372 27 27.0 0.071618 0.072581
42 Taekwondo 229 241 20 20.0 0.087336 0.082988
43 Tennis 486 760 62 44.0 0.090535 0.081579
44 Trampolining 44 49 5 5.0 0.113636 0.102041
45 Triathlon 175 180 5 5.0 0.028571 0.027778
46 Volleyball 1129 1374 166 156.0 0.138175 0.120815
47 Water Polo 337 2262 287 63.0 0.186944 0.126879
48 Weightlifting 356 2526 182 35.0 0.098315 0.072051
49 Wrestling 222 4766 395 18.0 0.081081 0.082879

Using Pandas¶

Descriptive Stats¶

In [105]:
# describe overall age, height, weight
olympics_games_distinct[['Age', 'Height', 'Weight']].describe()
Out[105]:
Age Height Weight
count 260416.000000 210917.000000 207146.000000
mean 25.454776 175.338953 70.680872
std 6.163869 10.518507 14.330655
min 10.000000 127.000000 25.000000
25% 21.000000 168.000000 60.000000
50% 24.000000 175.000000 70.000000
75% 28.000000 183.000000 79.000000
max 97.000000 226.000000 214.000000
In [106]:
# describe age, height, weight for female athletes 
female_df[['Age', 'Height', 'Weight']].describe()
Out[106]:
Age Height Weight
count 73976.000000 67378.000000 66585.000000
mean 23.697713 167.839740 60.011414
std 5.695049 8.778528 10.195580
min 11.000000 127.000000 25.000000
25% 20.000000 162.000000 54.000000
50% 23.000000 168.000000 59.000000
75% 27.000000 173.000000 65.000000
max 74.000000 213.000000 167.000000
In [107]:
# describe age, height, weight for male athletes 
male_df[['Age', 'Height', 'Weight']].describe()
Out[107]:
Age Height Weight
count 186440.000000 143539.000000 140561.000000
mean 26.151947 178.859125 75.735090
std 6.203904 9.360033 13.188625
min 10.000000 127.000000 28.000000
25% 22.000000 172.000000 67.000000
50% 25.000000 179.000000 74.000000
75% 29.000000 185.000000 83.000000
max 97.000000 226.000000 214.000000
In [108]:
male_medals_df = pysqldf("""SELECT * FROM athlete_medals_df WHERE Sex = 'M'""")
male_medals_df.describe()
Out[108]:
ID avg_age avg_height avg_weight total_medal
count 101590.000000 101590.000000 101590.000000 101590.000000 101590.000000
mean 67174.582420 25.818763 178.512115 75.072944 0.280726
std 39156.392093 5.658437 8.061680 11.576350 0.689189
min 1.000000 10.000000 127.000000 28.000000 0.000000
25% 33104.250000 22.000000 176.000000 71.000000 0.000000
50% 66973.000000 25.000000 176.315410 71.941320 0.000000
75% 101153.750000 28.000000 183.000000 80.000000 0.000000
max 135571.000000 97.000000 226.000000 214.000000 28.000000
In [109]:
female_medals_df= pysqldf("""SELECT * FROM athlete_medals_df WHERE Sex = 'F'""")
female_medals_df.describe()
Out[109]:
ID avg_age avg_height avg_weight total_medal
count 33981.000000 33981.000000 33981.000000 33981.000000 33981.000000
mean 69613.901239 23.902592 169.748115 62.578981 0.331156
std 39018.934491 5.200672 8.354558 10.336990 0.808477
min 5.000000 11.000000 127.000000 25.000000 0.000000
25% 36379.000000 20.000000 164.000000 55.000000 0.000000
50% 69604.000000 23.666667 170.000000 62.000000 0.000000
75% 103309.000000 27.000000 176.315410 71.000000 0.000000
max 135568.000000 74.000000 213.000000 167.000000 18.000000

Prove/disprove hypothesis 1¶

Metric 1: Total medals¶
In [237]:
medalavg_awayhome_df.columns
Out[237]:
Index(['Games', 'home_team_cnt', 'away_team_cnt', 'home_medal_count',
       'away_medal_count', 'home_avgmedal', 'away_avgmedal'],
      dtype='object')
In [238]:
# games where home medal count > away medal count 
medalavg_awayhome_df[medalavg_awayhome_df['home_medal_count'] > medalavg_awayhome_df['away_medal_count']]
Out[238]:
Games home_team_cnt away_team_cnt home_medal_count away_medal_count home_avgmedal away_avgmedal
2 1904 Summer 59 27 394 92 6.677966 3.407407
In [239]:
# games where home teams count < away teams count 
medalavg_awayhome_df[medalavg_awayhome_df['home_medal_count'] < medalavg_awayhome_df['away_medal_count']]
Out[239]:
Games home_team_cnt away_team_cnt home_medal_count away_medal_count home_avgmedal away_avgmedal
0 1896 Summer 5 13 48 95 9.600000 7.307692
1 1900 Summer 122 77 225 368 1.844262 4.779221
3 1906 Summer 21 33 102 356 4.857143 10.787879
4 1908 Summer 34 39 368 463 10.823529 11.871795
5 1912 Summer 17 84 190 751 11.176471 8.940476
6 1920 Summer 9 63 188 1120 20.888889 17.777778
7 1924 Summer 5 85 110 722 22.000000 8.494118
8 1924 Winter 3 25 10 120 3.333333 4.800000
9 1928 Summer 3 64 57 677 19.000000 10.578125
10 1928 Winter 3 38 12 77 4.000000 2.026316
11 1932 Summer 4 55 189 458 47.250000 8.327273
12 1932 Winter 3 26 34 58 11.333333 2.230769
13 1936 Summer 5 100 224 693 44.800000 6.930000
14 1936 Winter 3 51 7 101 2.333333 1.980392
15 1948 Summer 5 106 61 791 12.200000 7.462264
16 1948 Winter 3 43 28 107 9.333333 2.488372
17 1952 Summer 5 128 40 857 8.000000 6.695312
18 1952 Winter 3 49 19 117 6.333333 2.387755
19 1956 Summer 6 118 72 821 12.000000 6.957627
20 1956 Winter 3 53 8 142 2.666667 2.679245
21 1960 Summer 5 181 88 823 17.600000 4.546961
22 1960 Winter 4 36 27 120 6.750000 3.333333
23 1964 Summer 5 163 62 967 12.400000 5.932515
24 1964 Winter 3 63 17 169 5.666667 2.682540
25 1968 Summer 1 111 9 1048 9.000000 9.441441
26 1968 Winter 3 67 9 190 3.000000 2.835821
27 1972 Summer 8 131 253 962 31.625000 7.343511
28 1972 Winter 3 60 3 196 1.000000 3.266667
29 1976 Summer 1 91 23 1297 23.000000 14.252747
30 1976 Winter 3 67 7 204 2.333333 3.044776
31 1980 Summer 1 79 442 942 442.000000 11.924051
32 1980 Winter 3 55 30 188 10.000000 3.418182
33 1984 Summer 1 139 352 1124 352.000000 8.086331
34 1984 Winter 3 77 1 221 0.333333 2.870130
35 1988 Summer 3 174 77 1505 25.666667 8.649425
36 1988 Winter 4 94 6 257 1.500000 2.734043
37 1992 Summer 1 211 69 1643 69.000000 7.786730
38 1992 Winter 4 107 12 306 3.000000 2.859813
39 1994 Winter 1 100 30 301 30.000000 3.010000
40 1996 Summer 4 242 259 1583 64.750000 6.541322
41 1998 Winter 3 103 13 427 4.333333 4.145631
42 2000 Summer 4 239 183 1821 45.750000 7.619247
43 2002 Winter 3 111 84 394 28.000000 3.549550
44 2004 Summer 3 257 31 1970 10.333333 7.665370
45 2006 Winter 3 110 25 501 8.333333 4.554545
46 2008 Summer 5 287 184 1864 36.800000 6.494774
47 2010 Winter 3 113 90 430 30.000000 3.805310
48 2012 Summer 3 242 126 1815 42.000000 7.500000
49 2014 Winter 4 115 68 529 17.000000 4.600000
50 2016 Summer 3 245 50 1973 16.666667 8.053061
In [240]:
# games where avg home medals < avg away medals 
df1 = medalavg_awayhome_df[medalavg_awayhome_df['home_avgmedal'] < medalavg_awayhome_df['away_avgmedal']]
print(df1)
print(df1.count())
          Games  home_team_cnt  away_team_cnt  home_medal_count  \
1   1900 Summer            122             77               225   
3   1906 Summer             21             33               102   
4   1908 Summer             34             39               368   
8   1924 Winter              3             25                10   
20  1956 Winter              3             53                 8   
25  1968 Summer              1            111                 9   
28  1972 Winter              3             60                 3   
30  1976 Winter              3             67                 7   
34  1984 Winter              3             77                 1   
36  1988 Winter              4             94                 6   

    away_medal_count  home_avgmedal  away_avgmedal  
1                368       1.844262       4.779221  
3                356       4.857143      10.787879  
4                463      10.823529      11.871795  
8                120       3.333333       4.800000  
20               142       2.666667       2.679245  
25              1048       9.000000       9.441441  
28               196       1.000000       3.266667  
30               204       2.333333       3.044776  
34               221       0.333333       2.870130  
36               257       1.500000       2.734043  
Games               10
home_team_cnt       10
away_team_cnt       10
home_medal_count    10
away_medal_count    10
home_avgmedal       10
away_avgmedal       10
dtype: int64
In [160]:
medalavg_awayhome_df
Out[160]:
Games home_team_cnt away_team_cnt home_medal_count away_medal_count home_avgmedal away_avgmedal
0 1896 Summer 5 13 48 95 9.600000 7.307692
1 1900 Summer 122 77 225 368 1.844262 4.779221
2 1904 Summer 59 27 394 92 6.677966 3.407407
3 1906 Summer 21 33 102 356 4.857143 10.787879
4 1908 Summer 34 39 368 463 10.823529 11.871795
5 1912 Summer 17 84 190 751 11.176471 8.940476
6 1920 Summer 9 63 188 1120 20.888889 17.777778
7 1924 Summer 5 85 110 722 22.000000 8.494118
8 1924 Winter 3 25 10 120 3.333333 4.800000
9 1928 Summer 3 64 57 677 19.000000 10.578125
10 1928 Winter 3 38 12 77 4.000000 2.026316
11 1932 Summer 4 55 189 458 47.250000 8.327273
12 1932 Winter 3 26 34 58 11.333333 2.230769
13 1936 Summer 5 100 224 693 44.800000 6.930000
14 1936 Winter 3 51 7 101 2.333333 1.980392
15 1948 Summer 5 106 61 791 12.200000 7.462264
16 1948 Winter 3 43 28 107 9.333333 2.488372
17 1952 Summer 5 128 40 857 8.000000 6.695312
18 1952 Winter 3 49 19 117 6.333333 2.387755
19 1956 Summer 6 118 72 821 12.000000 6.957627
20 1956 Winter 3 53 8 142 2.666667 2.679245
21 1960 Summer 5 181 88 823 17.600000 4.546961
22 1960 Winter 4 36 27 120 6.750000 3.333333
23 1964 Summer 5 163 62 967 12.400000 5.932515
24 1964 Winter 3 63 17 169 5.666667 2.682540
25 1968 Summer 1 111 9 1048 9.000000 9.441441
26 1968 Winter 3 67 9 190 3.000000 2.835821
27 1972 Summer 8 131 253 962 31.625000 7.343511
28 1972 Winter 3 60 3 196 1.000000 3.266667
29 1976 Summer 1 91 23 1297 23.000000 14.252747
30 1976 Winter 3 67 7 204 2.333333 3.044776
31 1980 Summer 1 79 442 942 442.000000 11.924051
32 1980 Winter 3 55 30 188 10.000000 3.418182
33 1984 Summer 1 139 352 1124 352.000000 8.086331
34 1984 Winter 3 77 1 221 0.333333 2.870130
35 1988 Summer 3 174 77 1505 25.666667 8.649425
36 1988 Winter 4 94 6 257 1.500000 2.734043
37 1992 Summer 1 211 69 1643 69.000000 7.786730
38 1992 Winter 4 107 12 306 3.000000 2.859813
39 1994 Winter 1 100 30 301 30.000000 3.010000
40 1996 Summer 4 242 259 1583 64.750000 6.541322
41 1998 Winter 3 103 13 427 4.333333 4.145631
42 2000 Summer 4 239 183 1821 45.750000 7.619247
43 2002 Winter 3 111 84 394 28.000000 3.549550
44 2004 Summer 3 257 31 1970 10.333333 7.665370
45 2006 Winter 3 110 25 501 8.333333 4.554545
46 2008 Summer 5 287 184 1864 36.800000 6.494774
47 2010 Winter 3 113 90 430 30.000000 3.805310
48 2012 Summer 3 242 126 1815 42.000000 7.500000
49 2014 Winter 4 115 68 529 17.000000 4.600000
50 2016 Summer 3 245 50 1973 16.666667 8.053061
In [242]:
# games where avg home medals > avg away medals 
df2 = medalavg_awayhome_df[medalavg_awayhome_df['home_avgmedal'] > medalavg_awayhome_df['away_avgmedal']]
print(df2)
print(df2.count())
          Games  home_team_cnt  away_team_cnt  home_medal_count  \
0   1896 Summer              5             13                48   
2   1904 Summer             59             27               394   
5   1912 Summer             17             84               190   
6   1920 Summer              9             63               188   
7   1924 Summer              5             85               110   
9   1928 Summer              3             64                57   
10  1928 Winter              3             38                12   
11  1932 Summer              4             55               189   
12  1932 Winter              3             26                34   
13  1936 Summer              5            100               224   
14  1936 Winter              3             51                 7   
15  1948 Summer              5            106                61   
16  1948 Winter              3             43                28   
17  1952 Summer              5            128                40   
18  1952 Winter              3             49                19   
19  1956 Summer              6            118                72   
21  1960 Summer              5            181                88   
22  1960 Winter              4             36                27   
23  1964 Summer              5            163                62   
24  1964 Winter              3             63                17   
26  1968 Winter              3             67                 9   
27  1972 Summer              8            131               253   
29  1976 Summer              1             91                23   
31  1980 Summer              1             79               442   
32  1980 Winter              3             55                30   
33  1984 Summer              1            139               352   
35  1988 Summer              3            174                77   
37  1992 Summer              1            211                69   
38  1992 Winter              4            107                12   
39  1994 Winter              1            100                30   
40  1996 Summer              4            242               259   
41  1998 Winter              3            103                13   
42  2000 Summer              4            239               183   
43  2002 Winter              3            111                84   
44  2004 Summer              3            257                31   
45  2006 Winter              3            110                25   
46  2008 Summer              5            287               184   
47  2010 Winter              3            113                90   
48  2012 Summer              3            242               126   
49  2014 Winter              4            115                68   
50  2016 Summer              3            245                50   

    away_medal_count  home_avgmedal  away_avgmedal  
0                 95       9.600000       7.307692  
2                 92       6.677966       3.407407  
5                751      11.176471       8.940476  
6               1120      20.888889      17.777778  
7                722      22.000000       8.494118  
9                677      19.000000      10.578125  
10                77       4.000000       2.026316  
11               458      47.250000       8.327273  
12                58      11.333333       2.230769  
13               693      44.800000       6.930000  
14               101       2.333333       1.980392  
15               791      12.200000       7.462264  
16               107       9.333333       2.488372  
17               857       8.000000       6.695312  
18               117       6.333333       2.387755  
19               821      12.000000       6.957627  
21               823      17.600000       4.546961  
22               120       6.750000       3.333333  
23               967      12.400000       5.932515  
24               169       5.666667       2.682540  
26               190       3.000000       2.835821  
27               962      31.625000       7.343511  
29              1297      23.000000      14.252747  
31               942     442.000000      11.924051  
32               188      10.000000       3.418182  
33              1124     352.000000       8.086331  
35              1505      25.666667       8.649425  
37              1643      69.000000       7.786730  
38               306       3.000000       2.859813  
39               301      30.000000       3.010000  
40              1583      64.750000       6.541322  
41               427       4.333333       4.145631  
42              1821      45.750000       7.619247  
43               394      28.000000       3.549550  
44              1970      10.333333       7.665370  
45               501       8.333333       4.554545  
46              1864      36.800000       6.494774  
47               430      30.000000       3.805310  
48              1815      42.000000       7.500000  
49               529      17.000000       4.600000  
50              1973      16.666667       8.053061  
Games               41
home_team_cnt       41
away_team_cnt       41
home_medal_count    41
away_medal_count    41
home_avgmedal       41
away_avgmedal       41
dtype: int64
In [241]:
# using correlation 
medalavg_awayhome_df[['home_team_cnt','away_team_cnt','home_medal_count','away_medal_count','home_avgmedal','away_avgmedal']].corr(method='pearson')
Out[241]:
home_team_cnt away_team_cnt home_medal_count away_medal_count home_avgmedal away_avgmedal
home_team_cnt 1.000000 -0.170452 0.399454 -0.145728 -0.113990 0.012361
away_team_cnt -0.170452 1.000000 0.113671 0.894231 0.125224 0.178024
home_medal_count 0.399454 0.113671 1.000000 0.266063 0.610863 0.447170
away_medal_count -0.145728 0.894231 0.266063 1.000000 0.245816 0.559944
home_avgmedal -0.113990 0.125224 0.610863 0.245816 1.000000 0.306029
away_avgmedal 0.012361 0.178024 0.447170 0.559944 0.306029 1.000000
Metric 2: Total gold medals¶
In [245]:
# number of games where home teams' gold > away teams' gold 
print((goldcnt_awayhome_df[goldcnt_awayhome_df['home_avggoldmedal'] > goldcnt_awayhome_df['away_avggoldmedal']]).count())
Games                36
home_team_cnt        36
away_team_cnt        36
home_gold_count      36
away_gold_count      36
home_avggoldmedal    36
away_avggoldmedal    36
dtype: int64
In [247]:
# number of games where home teams' gold < away teams' gold 
print((goldcnt_awayhome_df[goldcnt_awayhome_df['home_avggoldmedal'] < goldcnt_awayhome_df['away_avggoldmedal']]).count())
Games                15
home_team_cnt        15
away_team_cnt        15
home_gold_count      15
away_gold_count      15
home_avggoldmedal    15
away_avggoldmedal    15
dtype: int64

Prove/disprove hypothesis 2¶

Metric 1: Total medals¶
In [115]:
# correlation 
athlete_medals_df[['avg_age','avg_height','avg_weight','total_medal']].corr(method='pearson')
Out[115]:
avg_age avg_height avg_weight total_medal
avg_age 1.000000 0.076059 0.141231 0.046408
avg_height 0.076059 1.000000 0.762834 0.043841
avg_weight 0.141231 0.762834 1.000000 0.044606
total_medal 0.046408 0.043841 0.044606 1.000000
In [116]:
# using p-values 
x = athlete_medals_df[['avg_age','avg_height','avg_weight']]
y = athlete_medals_df['total_medal']

# with sklearn
regr = LinearRegression()
regr.fit(x, y)

print('Intercept: \n', regr.intercept_)
print('Coefficients: \n', regr.coef_)

# with statsmodels
x = sm.add_constant(x) # adding a constant
 
model = sm.OLS(y, x).fit()
predictions = model.predict(x) 
 
print_model = model.summary()
print(print_model)
Intercept: 
 -0.29632281634169494
Coefficients: 
 [0.00537322 0.00213988 0.00105983]
                            OLS Regression Results                            
==============================================================================
Dep. Variable:            total_medal   R-squared:                       0.004
Model:                            OLS   Adj. R-squared:                  0.004
Method:                 Least Squares   F-statistic:                     178.2
Date:                Fri, 16 Sep 2022   Prob (F-statistic):          2.72e-115
Time:                        16:06:44   Log-Likelihood:            -1.4780e+05
No. Observations:              135571   AIC:                         2.956e+05
Df Residuals:                  135567   BIC:                         2.957e+05
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.2963      0.048     -6.114      0.000      -0.391      -0.201
avg_age        0.0054      0.000     15.242      0.000       0.005       0.006
avg_height     0.0021      0.000      6.347      0.000       0.001       0.003
avg_weight     0.0011      0.000      4.349      0.000       0.001       0.002
==============================================================================
Omnibus:                   137690.654   Durbin-Watson:                   0.009
Prob(Omnibus):                  0.000   Jarque-Bera (JB):         15800883.137
Skew:                           4.824   Prob(JB):                         0.00
Kurtosis:                      55.001   Cond. No.                     4.77e+03
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 4.77e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
In [165]:
ath_df1 = (pysqldf(""" SELECT ID, Name, Games, Age, Height, Weight, COUNT(medal) AS medal_cnt
                    FROM olympics_games_distinct
                    GROUP BY ID, Name, Games, Age, Height, Weight; """)) 
In [166]:
# correlation between age, height, and weight and number of medals instead of using average values 
ath_df1[['Age', 'Height', 'Weight','medal_cnt']].corr(method='pearson')
Out[166]:
Age Height Weight medal_cnt
Age 1.000000 0.082661 0.163917 0.010306
Height 0.082661 1.000000 0.774509 0.041712
Weight 0.163917 0.774509 1.000000 0.037541
medal_cnt 0.010306 0.041712 0.037541 1.000000
In [172]:
ath_df2 = (pysqldf(""" SELECT ID, Name, Games, Age, Height, Weight,Medal, COUNT(medal) AS medal_cnt
                    FROM olympics_games_distinct
                    WHERE Medal = 'Gold'
                    GROUP BY ID, Name, Games, Age, Height, Weight, Medal; """)) 
In [173]:
# correlation between age, height, and weight and number of gold medals 
ath_df2[['Age', 'Height', 'Weight','medal_cnt']].corr(method='pearson')
Out[173]:
Age Height Weight medal_cnt
Age 1.000000 0.094635 0.167282 -0.049534
Height 0.094635 1.000000 0.804981 -0.045838
Weight 0.167282 0.804981 1.000000 -0.083971
medal_cnt -0.049534 -0.045838 -0.083971 1.000000
Metric 2: Total gold medals¶
In [265]:
athletes_gold_df.columns
Out[265]:
Index(['ID', 'Name', 'Sex', 'avg_age', 'avg_height', 'avg_weight',
       'total_goldmedal'],
      dtype='object')
In [266]:
# correlation 
athletes_gold_df[['avg_age','avg_height','avg_weight','total_goldmedal']].corr(method='pearson')
Out[266]:
avg_age avg_height avg_weight total_goldmedal
avg_age 1.000000 0.075681 0.132929 0.027825
avg_height 0.075681 1.000000 0.785820 -0.028184
avg_weight 0.132929 0.785820 1.000000 -0.036213
total_goldmedal 0.027825 -0.028184 -0.036213 1.000000
In [267]:
# using p-values 
x = athletes_gold_df[['avg_age','avg_height','avg_weight']]
y = athletes_gold_df['total_goldmedal']

# with sklearn
regr = LinearRegression()
regr.fit(x, y)

print('Intercept: \n', regr.intercept_)
print('Coefficients: \n', regr.coef_)

# with statsmodels
x = sm.add_constant(x) # adding a constant
 
model = sm.OLS(y, x).fit()
predictions = model.predict(x) 
 
print_model = model.summary()
print(print_model)
Intercept: 
 1.3080802087573147
Coefficients: 
 [ 0.00437958  0.00024712 -0.00245168]
                            OLS Regression Results                            
==============================================================================
Dep. Variable:        total_goldmedal   R-squared:                       0.002
Model:                            OLS   Adj. R-squared:                  0.002
Method:                 Least Squares   F-statistic:                     8.356
Date:                Sun, 18 Sep 2022   Prob (F-statistic):           1.52e-05
Time:                        14:04:47   Log-Likelihood:                -11695.
No. Observations:               10425   AIC:                         2.340e+04
Df Residuals:                   10421   BIC:                         2.343e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          1.3081      0.173      7.575      0.000       0.970       1.647
avg_age        0.0044      0.001      3.372      0.001       0.002       0.007
avg_height     0.0002      0.001      0.203      0.839      -0.002       0.003
avg_weight    -0.0025      0.001     -2.709      0.007      -0.004      -0.001
==============================================================================
Omnibus:                    12234.226   Durbin-Watson:                   0.036
Prob(Omnibus):                  0.000   Jarque-Bera (JB):          3492112.221
Skew:                           5.835   Prob(JB):                         0.00
Kurtosis:                      91.900   Cond. No.                     4.63e+03
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 4.63e+03. This might indicate that there are
strong multicollinearity or other numerical problems.

Prove/disprove hypothesis 3¶

Metric 1: Total medals¶
In [117]:
sportgendermedal_df
Out[117]:
Sport female_ath_count male_ath_count female_medal_cnt male_medal_cnt female_avgmedal male_avgmedal
0 Alpine Skiing 996 1739 213 215 0.213855 0.123634
1 Alpinism 1 24 1 24 1.000000 1.000000
2 Archery 500 613 121 232 0.242000 0.378467
3 Art Competitions 204 1610 11 145 0.053922 0.090062
4 Athletics 6529 15542 1275 2694 0.195283 0.173337
5 Badminton 412 399 84 84 0.203883 0.210526
6 Basketball 932 2481 393 687 0.421674 0.276904
7 Beach Volleyball 189 194 36 36 0.190476 0.185567
8 Biathlon 371 764 150 258 0.404313 0.337696
9 Bobsleigh 109 1585 24 374 0.220183 0.235962
10 Boxing 65 5197 24 920 0.369231 0.177025
11 Canoeing 702 2504 282 883 0.401709 0.352636
12 Croquet 3 7 0 8 0.000000 1.142857
13 Cross Country Skiing 717 1683 339 437 0.472803 0.259655
14 Curling 160 186 68 84 0.425000 0.451613
15 Cycling 714 5105 176 1087 0.246499 0.212929
16 Diving 635 831 201 226 0.316535 0.271961
17 Equestrianism 459 1886 207 758 0.450980 0.401909
18 Fencing 880 3243 349 1394 0.396591 0.429849
19 Figure Skating 824 748 191 195 0.231796 0.260695
20 Football 734 5427 302 1269 0.411444 0.233831
21 Freestyle Skiing 267 359 51 51 0.191011 0.142061
22 Golf 70 148 6 40 0.085714 0.270270
23 Gymnastics 1499 2635 701 1555 0.467645 0.590133
24 Handball 1027 1675 472 588 0.459591 0.351045
25 Hockey 996 2829 478 1050 0.479920 0.371156
26 Ice Hockey 498 3386 300 1230 0.602410 0.363260
27 Judo 757 1967 196 351 0.258917 0.178444
28 Luge 228 544 45 135 0.197368 0.248162
29 Modern Pentathlon 114 750 15 171 0.131579 0.228000
30 Motorboating 1 13 0 7 0.000000 0.538462
31 Rowing 1483 6204 720 2225 0.485502 0.358640
32 Rugby Sevens 148 151 36 38 0.243243 0.251656
33 Sailing 629 3851 134 1087 0.213037 0.282264
34 Shooting 737 4145 146 1082 0.198100 0.261037
35 Short Track Speed Skating 209 235 140 144 0.669856 0.612766
36 Skeleton 45 101 12 18 0.266667 0.178218
37 Ski Jumping 30 844 3 204 0.100000 0.241706
38 Snowboarding 239 328 45 45 0.188285 0.137195
39 Speed Skating 528 1054 242 338 0.458333 0.320683
40 Swimming 3621 5144 1374 1674 0.379453 0.325428
41 Table Tennis 377 372 84 84 0.222812 0.225806
42 Taekwondo 229 241 72 72 0.314410 0.298755
43 Tennis 486 760 140 200 0.288066 0.263158
44 Trampolining 44 49 15 15 0.340909 0.306122
45 Triathlon 175 180 15 15 0.085714 0.083333
46 Volleyball 1129 1374 474 495 0.419841 0.360262
47 Water Polo 337 2262 191 866 0.566766 0.382847
48 Weightlifting 356 2526 105 541 0.294944 0.214173
49 Wrestling 222 4766 68 1228 0.306306 0.257658
In [250]:
# count number of sports where male atletes have more medals than female athletes 
display(pysqldf(""" SELECT *
                    FROM sportgender_gold_df
                    WHERE male_avgmedal > female_avgmedal; """))
Sport female_ath_count male_ath_count female_medal_cnt male_medal_cnt female_avgmedal male_avgmedal
0 Archery 500 613 121 232 0.242000 0.378467
1 Art Competitions 204 1610 11 145 0.053922 0.090062
2 Badminton 412 399 84 84 0.203883 0.210526
3 Bobsleigh 109 1585 24 374 0.220183 0.235962
4 Croquet 3 7 0 8 0.000000 1.142857
5 Curling 160 186 68 84 0.425000 0.451613
6 Fencing 880 3243 349 1394 0.396591 0.429849
7 Figure Skating 824 748 191 195 0.231796 0.260695
8 Golf 70 148 6 40 0.085714 0.270270
9 Gymnastics 1499 2635 701 1555 0.467645 0.590133
10 Luge 228 544 45 135 0.197368 0.248162
11 Modern Pentathlon 114 750 15 171 0.131579 0.228000
12 Motorboating 1 13 0 7 0.000000 0.538462
13 Rugby Sevens 148 151 36 38 0.243243 0.251656
14 Sailing 629 3851 134 1087 0.213037 0.282264
15 Shooting 737 4145 146 1082 0.198100 0.261037
16 Ski Jumping 30 844 3 204 0.100000 0.241706
17 Table Tennis 377 372 84 84 0.222812 0.225806
In [251]:
# count number of sports where male atletes have less medals than female athletes 
display(pysqldf(""" SELECT *
                    FROM sportgendermedal_df
                    WHERE male_avgmedal < female_avgmedal; """))
Sport female_ath_count male_ath_count female_medal_cnt male_medal_cnt female_avgmedal male_avgmedal
0 Alpine Skiing 996 1739 213 215 0.213855 0.123634
1 Athletics 6529 15542 1275 2694 0.195283 0.173337
2 Basketball 932 2481 393 687 0.421674 0.276904
3 Beach Volleyball 189 194 36 36 0.190476 0.185567
4 Biathlon 371 764 150 258 0.404313 0.337696
5 Boxing 65 5197 24 920 0.369231 0.177025
6 Canoeing 702 2504 282 883 0.401709 0.352636
7 Cross Country Skiing 717 1683 339 437 0.472803 0.259655
8 Cycling 714 5105 176 1087 0.246499 0.212929
9 Diving 635 831 201 226 0.316535 0.271961
10 Equestrianism 459 1886 207 758 0.450980 0.401909
11 Football 734 5427 302 1269 0.411444 0.233831
12 Freestyle Skiing 267 359 51 51 0.191011 0.142061
13 Handball 1027 1675 472 588 0.459591 0.351045
14 Hockey 996 2829 478 1050 0.479920 0.371156
15 Ice Hockey 498 3386 300 1230 0.602410 0.363260
16 Judo 757 1967 196 351 0.258917 0.178444
17 Rowing 1483 6204 720 2225 0.485502 0.358640
18 Short Track Speed Skating 209 235 140 144 0.669856 0.612766
19 Skeleton 45 101 12 18 0.266667 0.178218
20 Snowboarding 239 328 45 45 0.188285 0.137195
21 Speed Skating 528 1054 242 338 0.458333 0.320683
22 Swimming 3621 5144 1374 1674 0.379453 0.325428
23 Taekwondo 229 241 72 72 0.314410 0.298755
24 Tennis 486 760 140 200 0.288066 0.263158
25 Trampolining 44 49 15 15 0.340909 0.306122
26 Triathlon 175 180 15 15 0.085714 0.083333
27 Volleyball 1129 1374 474 495 0.419841 0.360262
28 Water Polo 337 2262 191 866 0.566766 0.382847
29 Weightlifting 356 2526 105 541 0.294944 0.214173
30 Wrestling 222 4766 68 1228 0.306306 0.257658
Metric 2: Total gold medals¶
In [264]:
sportgender_gold_df.columns
Out[264]:
Index(['Sport', 'female_ath_count', 'male_ath_count', 'male_goldmedal_cnt',
       'female_goldmedal_cnt', 'female_avggoldmedal', 'male_avggoldmedal'],
      dtype='object')
In [262]:
# sports where male atletes have more medals than female athletes 
display(pysqldf(""" SELECT *
                    FROM sportgender_gold_df
                    WHERE male_avggoldmedal > female_avggoldmedal; """))
Sport female_ath_count male_ath_count male_goldmedal_cnt female_goldmedal_cnt female_avggoldmedal male_avggoldmedal
0 Archery 500 613 92 43.0 0.086000 0.150082
1 Art Competitions 204 1610 48 1.0 0.004902 0.029814
2 Badminton 412 399 27 27.0 0.065534 0.067669
3 Bobsleigh 109 1585 125 8.0 0.073394 0.078864
4 Croquet 3 7 4 0.0 0.000000 0.571429
5 Curling 160 186 28 22.0 0.137500 0.150538
6 Equestrianism 459 1886 269 55.0 0.119826 0.142630
7 Fencing 880 3243 475 119.0 0.135227 0.146469
8 Figure Skating 824 748 66 65.0 0.078883 0.088235
9 Golf 70 148 13 2.0 0.028571 0.087838
10 Gymnastics 1499 2635 557 234.0 0.156104 0.211385
11 Luge 228 544 47 15.0 0.065789 0.086397
12 Modern Pentathlon 114 750 57 5.0 0.043860 0.076000
13 Motorboating 1 13 7 0.0 0.000000 0.538462
14 Rugby Sevens 148 151 13 12.0 0.081081 0.086093
15 Sailing 629 3851 400 47.0 0.074722 0.103869
16 Shooting 737 4145 361 49.0 0.066486 0.087093
17 Ski Jumping 30 844 68 1.0 0.033333 0.080569
18 Table Tennis 377 372 27 27.0 0.071618 0.072581
19 Wrestling 222 4766 395 18.0 0.081081 0.082879
In [261]:
# sports where male atletes have less medals than female athletes 
display(pysqldf(""" SELECT *
                    FROM sportgender_gold_df
                    WHERE male_avggoldmedal < female_avggoldmedal; """))
Sport female_ath_count male_ath_count male_goldmedal_cnt female_goldmedal_cnt female_avggoldmedal male_avggoldmedal
0 Alpine Skiing 996 1739 71 72.0 0.072289 0.040828
1 Athletics 6529 15542 911 428.0 0.065554 0.058615
2 Basketball 932 2481 234 131.0 0.140558 0.094317
3 Beach Volleyball 189 194 12 12.0 0.063492 0.061856
4 Biathlon 371 764 86 50.0 0.134771 0.112565
5 Boxing 65 5197 246 6.0 0.092308 0.047335
6 Canoeing 702 2504 294 94.0 0.133903 0.117412
7 Cross Country Skiing 717 1683 143 113.0 0.157601 0.084967
8 Cycling 714 5105 366 58.0 0.081232 0.071694
9 Diving 635 831 75 67.0 0.105512 0.090253
10 Football 734 5427 414 101.0 0.137602 0.076285
11 Freestyle Skiing 267 359 17 17.0 0.063670 0.047354
12 Handball 1027 1675 194 155.0 0.150925 0.115821
13 Hockey 996 2829 360 158.0 0.158635 0.127253
14 Ice Hockey 498 3386 407 101.0 0.202811 0.120201
15 Judo 757 1967 88 49.0 0.064729 0.044738
16 Rowing 1483 6204 740 238.0 0.160486 0.119278
17 Short Track Speed Skating 209 235 48 47.0 0.224880 0.204255
18 Skeleton 45 101 6 4.0 0.088889 0.059406
19 Snowboarding 239 328 15 15.0 0.062762 0.045732
20 Speed Skating 528 1054 112 81.0 0.153409 0.106262
21 Swimming 3621 5144 606 493.0 0.136150 0.117807
22 Taekwondo 229 241 20 20.0 0.087336 0.082988
23 Tennis 486 760 62 44.0 0.090535 0.081579
24 Trampolining 44 49 5 5.0 0.113636 0.102041
25 Triathlon 175 180 5 5.0 0.028571 0.027778
26 Volleyball 1129 1374 166 156.0 0.138175 0.120815
27 Water Polo 337 2262 287 63.0 0.186944 0.126879
28 Weightlifting 356 2526 182 35.0 0.098315 0.072051

Other questions¶

Do male athletes acquire more gold medals than female in mixed sport events ?¶
In [186]:
# number of male and female athletes in each event in 2016 olympics 
display(pysqldf(""" 
                    SELECT *
                    FROM(
                        SELECT Event, 
                               COUNT(CASE WHEN Sex = 'F' THEN 1 END) AS female_cnt, 
                               COUNT(CASE WHEN Sex = 'M' THEN 1 END) AS male_cnt

                        FROM(
                            SELECT DISTINCT ID, Name, Sex, Event, Sport, Medal
                            FROM olympics_games_distinct
                            WHERE Games = '2016 Summer') 
                        GROUP BY Event 
                    )
                    WHERE (female_cnt > 0) AND (male_cnt > 0)
                """))
Event female_cnt male_cnt
0 Badminton Mixed Doubles 16 16
1 Equestrianism Mixed Dressage, Individual 39 21
2 Equestrianism Mixed Dressage, Team 25 19
3 Equestrianism Mixed Jumping, Individual 13 62
4 Equestrianism Mixed Jumping, Team 11 49
5 Equestrianism Mixed Three-Day Event, Individual 23 42
6 Equestrianism Mixed Three-Day Event, Team 20 31
7 Sailing Mixed Multihull 20 20
8 Tennis Mixed Doubles 15 15

Extra - Using SQL window functions¶

In [120]:
top_ath_df = pysqldf("""SELECT Region, NOC, ID, Name, COUNT(Medal)  AS medals 
                        FROM olympics_games_distinct
                        GROUP BY Region, NOC, ID, Name
                         """)
top_ath_df['Region'] = top_ath_df['Region'].astype(pd.StringDtype())
top_ath_df['Name'] = top_ath_df['Name'].astype(pd.StringDtype())
top_ath_df
Out[120]:
Region NOC ID Name medals
0 <NA> ROT 3515 Paulo Amotun Lokoro 0
1 <NA> ROT 4379 Rami Anis 0
2 <NA> ROT 11364 Yiech Pur Biel 0
3 <NA> ROT 16287 Mabika Yolande Bukasa 0
4 <NA> ROT 20693 James Nyang Chiengjiek 0
... ... ... ... ... ...
137238 Zimbabwe ZIM 130453 Antonette Wilken (-Batchelor) 0
137239 Zimbabwe ZIM 130832 Hillary Wilson 0
137240 Zimbabwe ZIM 130880 Peter Arthur Wilson 0
137241 Zimbabwe ZIM 131478 Jennifer "Jenny" Wood 0
137242 Zimbabwe ZIM 135497 Lloyd Zvasiya 0

137243 rows × 5 columns

In [121]:
# Rank of athletes per region based on total number of medals 
df = pysqldf(""" SELECT Region, NOC, Name, medals,  
                        DENSE_RANK() OVER(PARTITION BY Region ORDER BY medals DESC) AS regional_rank 
                    FROM top_ath_df
                    --WHERE Region = 'Zimbabwe'           
                    """)
df               
Out[121]:
Region NOC Name medals regional_rank
0 None ROT Paulo Amotun Lokoro 0 1
1 None ROT Rami Anis 0 1
2 None ROT Yiech Pur Biel 0 1
3 None ROT Mabika Yolande Bukasa 0 1
4 None ROT James Nyang Chiengjiek 0 1
... ... ... ... ... ...
137238 Zimbabwe ZIM Antonette Wilken (-Batchelor) 0 3
137239 Zimbabwe ZIM Hillary Wilson 0 3
137240 Zimbabwe ZIM Peter Arthur Wilson 0 3
137241 Zimbabwe ZIM Jennifer "Jenny" Wood 0 3
137242 Zimbabwe ZIM Lloyd Zvasiya 0 3

137243 rows × 5 columns

In [122]:
# Rank of top athletes within their region 
ath_regionrank_df = pysqldf("""
                                SELECT Region, Name, medals, regional_rank 
                                FROM df
                                WHERE medals > 0 AND regional_rank = 1
                                ORDER BY medals DESC
                            """)
ath_regionrank_df
Out[122]:
Region Name medals regional_rank
0 United States Michael Fred Phelps, II 28 1
1 Russia Larysa Semenivna Latynina (Diriy-) 18 1
2 Italy Edoardo Mangiarotti 13 1
3 Japan Takashi Ono 13 1
4 Norway Ole Einar Bjrndalen 13 1
... ... ... ... ...
359 Venezuela Israel Jos Rubio Rivero 1 1
360 Venezuela Rafael Antonio Vidal Castro 1 1
361 Virgin Islands, US Peter William Holmberg 1 1
362 Zambia Samuel Matete 1 1
363 Zambia Keith Mwila 1 1

364 rows × 4 columns

In [123]:
top_ath_df1 = pysqldf("""SELECT Games, ID, Name, COUNT(Medal)  AS medals 
                        FROM olympics_games_distinct
                        GROUP BY Games, ID, Name
                         """)
top_ath_df1['Games'] = top_ath_df1['Games'].astype(pd.StringDtype())
top_ath_df1['Name'] = top_ath_df1['Name'].astype(pd.StringDtype())
top_ath_df1
Out[123]:
Games ID Name medals
0 1896 Summer 1724 Aristidis Akratopoulos 0
1 1896 Summer 1725 Konstantinos "Kostas" Akratopoulos 0
2 1896 Summer 4113 Anastasios Andreou 0
3 1896 Summer 4116 Ioannis Andreou 1
4 1896 Summer 4189 Nikolaos Andriakopoulos 1
... ... ... ... ...
187447 2016 Summer 135489 Anastasiya Valeryevna Zuyeva-Fesikova 0
187448 2016 Summer 135525 Martin Zwicker 1
187449 2016 Summer 135528 Marc Zwiebler 0
187450 2016 Summer 135547 Viktoriya Viktorovna Zyabkina 0
187451 2016 Summer 135568 Olga Igorevna Zyuzkova 0

187452 rows × 4 columns

In [124]:
# Rank of athletes per olympic game based on total number of medals 
df1 = pysqldf(""" SELECT Games, Name, medals,  
                        DENSE_RANK() OVER(PARTITION BY Games ORDER BY medals DESC) AS game_rank 
                    FROM top_ath_df1
                    --WHERE Region = 'Zimbabwe'           
                    """)
df1         
Out[124]:
Games Name medals game_rank
0 1896 Summer Hermann Otto Ludwig Weingrtner 6 1
1 1896 Summer Alfred Flatow 4 2
2 1896 Summer Robert "Bob" Garrett 4 2
3 1896 Summer Carl Schuhmann 4 2
4 1896 Summer James Brendan Bennet Connolly 3 3
... ... ... ... ...
187447 2016 Summer Henrikas ustautas 0 7
187448 2016 Summer Anastasiya Valeryevna Zuyeva-Fesikova 0 7
187449 2016 Summer Marc Zwiebler 0 7
187450 2016 Summer Viktoriya Viktorovna Zyabkina 0 7
187451 2016 Summer Olga Igorevna Zyuzkova 0 7

187452 rows × 4 columns

In [125]:
# Top athletes in each game
ath_gamerank_df = pysqldf("""
                                SELECT Games, Name, medals, game_rank 
                                FROM df1
                                WHERE medals > 0 AND game_rank = 1
                                ORDER BY medals DESC
                            """)
ath_gamerank_df
Out[125]:
Games Name medals game_rank
0 1980 Summer Aleksandr Nikolayevich Dityatin 8 1
1 2004 Summer Michael Fred Phelps, II 8 1
2 2008 Summer Michael Fred Phelps, II 8 1
3 1920 Summer Willis Augustus Lee, Jr. 7 1
4 1920 Summer Lloyd Spencer Spooner 7 1
5 1952 Summer Mariya Kindrativna Horokhovska 7 1
6 1960 Summer Borys Anfiyanovych Shakhlin 7 1
7 1968 Summer Mikhail Yakovlevich Voronin 7 1
8 1972 Summer Mark Andrew Spitz 7 1
9 1976 Summer Nikolay Yefimovich Andrianov 7 1
10 1988 Summer Matthew Nicholas "Matt" Biondi 7 1
11 1896 Summer Hermann Otto Ludwig Weingrtner 6 1
12 1904 Summer Burton Cecil Downing 6 1
13 1904 Summer George Louis Eyser 6 1
14 1904 Summer Anton Heida 6 1
15 1924 Summer Viljo Eino "Ville" Ritola (Koukkari-) 6 1
16 1936 Summer Konrad Frey 6 1
17 1956 Summer gnes Keleti-Srkny (Klein) 6 1
18 1956 Summer Larysa Semenivna Latynina (Diriy-) 6 1
19 1964 Summer Larysa Semenivna Latynina (Diriy-) 6 1
20 1984 Summer Li Ning 6 1
21 1992 Summer Vitaly Venediktovich Shcherbo 6 1
22 1996 Summer Aleksey Yuryevich Nemov 6 1
23 2000 Summer Aleksey Yuryevich Nemov 6 1
24 2012 Summer Michael Fred Phelps, II 6 1
25 2016 Summer Michael Fred Phelps, II 6 1
26 1900 Summer Irving Knott "Irv" Baxter 5 1
27 1900 Summer Walter Beardsley Tewksbury 5 1
28 1906 Summer Lon Ernest Moreaux 5 1
29 1906 Summer Martin Joseph Sheridan 5 1
30 1912 Summer Gustaf Vilhelm Carlberg 5 1
31 1924 Winter Roald Morel Larsen 5 1
32 1924 Winter Arnold Clas Robert Thunberg 5 1
33 1948 Summer Veikko Aarne Aleks Huhtanen 5 1
34 1980 Winter Eric Arthur Heiden 5 1
35 1992 Winter Yelena Valeryevna Vlbe (Trubitsyna-) 5 1
36 1992 Winter Lyubov Ivanovna Yegorova 5 1
37 1994 Winter Manuela Di Centa 5 1
38 1998 Winter Larisa Yevgenyevna Lazutina (Ptitsyna-) 5 1
39 2006 Winter Cynthia Nicole "Cindy" Klassen 5 1
40 2010 Winter Marit Bjrgen 5 1
41 2014 Winter Irene Karlijn "Ireen" Wst 5 1
42 1928 Summer Hermann Hnggi 4 1
43 1928 Summer Georg "Georges" Miez 4 1
44 1932 Summer Giulio Gaudini 4 1
45 1932 Summer Istvn Pelle 4 1
46 1932 Summer Heikki Ilmari Savolainen 4 1
47 1936 Winter Ivar Eugen Ballangrud (Eriksen-) 4 1
48 1956 Winter Edy Sixten Jernberg 4 1
49 1964 Winter Lidiya Pavlovna Skoblikova (-Polozkova) 4 1
50 1976 Winter Tatyana Borisovna Averina-Barabash 4 1
51 1984 Winter Karin Enke-Kania (-Busch-, -Richter) 4 1
52 1984 Winter Marja-Liisa Kirvesniemi-Hmlinen 4 1
53 1984 Winter Gunde Anders Svan 4 1
54 2002 Winter Ole Einar Bjrndalen 4 1
55 2002 Winter Janica Kosteli 4 1
56 1908 Summer Benjamin "Ben" Jones 3 1
57 1908 Summer Thomas "Ted" Ranken 3 1
58 1908 Summer Major Josiah George Ritchie 3 1
59 1908 Summer Melvin Winfield "Mel" Sheppard 3 1
60 1908 Summer Martin Joseph Sheridan 3 1
61 1908 Summer Oscar Gomer Swahn 3 1
62 1908 Summer Henry Taylor 3 1
63 1928 Winter Bernt Sverre Evensen 3 1
64 1948 Winter Henri Jean Oreiller 3 1
65 1952 Winter Hjalmar Johan Andersen 3 1
66 1952 Winter Annemarie "Mirl" Buchner (Fischer-) 3 1
67 1960 Winter Veikko Johannes Hakulinen 3 1
68 1968 Winter Toini Gustafsson (Karvonen-, -Rnnlund) 3 1
69 1968 Winter Jean-Claude Killy 3 1
70 1968 Winter Eero Antero Mntyranta 3 1
71 1972 Winter Ritva Marjatta Kajosmaa (Sakki-) 3 1
72 1972 Winter Atje Keulen-Deelstra 3 1
73 1972 Winter Galina Alekseyevna Kulakova 3 1
74 1972 Winter Adrie "Ard" Schenk 3 1
75 1972 Winter Pl Bjarne Tyldum 3 1
76 1972 Winter Vyacheslav Petrovich Vedenin 3 1
77 1988 Winter Andrea Ehrig-Schne-Mitscherlich 3 1
78 1988 Winter Karin Enke-Kania (-Busch-, -Richter) 3 1
79 1988 Winter Marjo Tuulevi Matikainen (-Kallstrm) 3 1
80 1988 Winter Valery Alekseyevich Medvedtsev 3 1
81 1988 Winter Matti Ensio Nyknen (-Paanala) 3 1
82 1988 Winter Vladimir Mikhaylovich Smirnov 3 1
83 1988 Winter Tamara Ivanovna Tikhonova 3 1
84 1988 Winter Yvonne Maria Therse van Gennip 3 1
85 1932 Winter Alexander Brengle "Alex" Hurd 2 1
86 1932 Winter Irving Warren Jaffee 2 1
87 1932 Winter William Frederick "Willy" Logan 2 1
88 1932 Winter Veli Selim Saarinen 2 1
89 1932 Winter John Amos "Jack" Shea 2 1

Visualizations¶

Hypothesis 1: host teams perform better than visiting teams¶

In [126]:
medalavg_awayhome_df
Out[126]:
Games home_team_cnt away_team_cnt home_medal_count away_medal_count home_avgmedal away_avgmedal
0 1896 Summer 5 13 48 95 9.600000 7.307692
1 1900 Summer 122 77 225 368 1.844262 4.779221
2 1904 Summer 59 27 394 92 6.677966 3.407407
3 1906 Summer 21 33 102 356 4.857143 10.787879
4 1908 Summer 34 39 368 463 10.823529 11.871795
5 1912 Summer 17 84 190 751 11.176471 8.940476
6 1920 Summer 9 63 188 1120 20.888889 17.777778
7 1924 Summer 5 85 110 722 22.000000 8.494118
8 1924 Winter 3 25 10 120 3.333333 4.800000
9 1928 Summer 3 64 57 677 19.000000 10.578125
10 1928 Winter 3 38 12 77 4.000000 2.026316
11 1932 Summer 4 55 189 458 47.250000 8.327273
12 1932 Winter 3 26 34 58 11.333333 2.230769
13 1936 Summer 5 100 224 693 44.800000 6.930000
14 1936 Winter 3 51 7 101 2.333333 1.980392
15 1948 Summer 5 106 61 791 12.200000 7.462264
16 1948 Winter 3 43 28 107 9.333333 2.488372
17 1952 Summer 5 128 40 857 8.000000 6.695312
18 1952 Winter 3 49 19 117 6.333333 2.387755
19 1956 Summer 6 118 72 821 12.000000 6.957627
20 1956 Winter 3 53 8 142 2.666667 2.679245
21 1960 Summer 5 181 88 823 17.600000 4.546961
22 1960 Winter 4 36 27 120 6.750000 3.333333
23 1964 Summer 5 163 62 967 12.400000 5.932515
24 1964 Winter 3 63 17 169 5.666667 2.682540
25 1968 Summer 1 111 9 1048 9.000000 9.441441
26 1968 Winter 3 67 9 190 3.000000 2.835821
27 1972 Summer 8 131 253 962 31.625000 7.343511
28 1972 Winter 3 60 3 196 1.000000 3.266667
29 1976 Summer 1 91 23 1297 23.000000 14.252747
30 1976 Winter 3 67 7 204 2.333333 3.044776
31 1980 Summer 1 79 442 942 442.000000 11.924051
32 1980 Winter 3 55 30 188 10.000000 3.418182
33 1984 Summer 1 139 352 1124 352.000000 8.086331
34 1984 Winter 3 77 1 221 0.333333 2.870130
35 1988 Summer 3 174 77 1505 25.666667 8.649425
36 1988 Winter 4 94 6 257 1.500000 2.734043
37 1992 Summer 1 211 69 1643 69.000000 7.786730
38 1992 Winter 4 107 12 306 3.000000 2.859813
39 1994 Winter 1 100 30 301 30.000000 3.010000
40 1996 Summer 4 242 259 1583 64.750000 6.541322
41 1998 Winter 3 103 13 427 4.333333 4.145631
42 2000 Summer 4 239 183 1821 45.750000 7.619247
43 2002 Winter 3 111 84 394 28.000000 3.549550
44 2004 Summer 3 257 31 1970 10.333333 7.665370
45 2006 Winter 3 110 25 501 8.333333 4.554545
46 2008 Summer 5 287 184 1864 36.800000 6.494774
47 2010 Winter 3 113 90 430 30.000000 3.805310
48 2012 Summer 3 242 126 1815 42.000000 7.500000
49 2014 Winter 4 115 68 529 17.000000 4.600000
50 2016 Summer 3 245 50 1973 16.666667 8.053061
In [127]:
# total number of medals for each team type 
n_games = len(medalavg_awayhome_df['Games'])
home_medal = medalavg_awayhome_df['home_medal_count']
away_medal = medalavg_awayhome_df['away_medal_count']
homeaway_comb = np.append(home_medal,away_medal)

df = pd.DataFrame(dict(game=medalavg_awayhome_df['Games'].tolist()*2, medal=homeaway_comb.tolist(),
                       team=["Home"]*n_games + ["Away"]*n_games))

fig = px.scatter(df, x="medal", y="game", color="team",
                 title="Total medals per game",
                 labels={"medal":"Total medals"} # customize axis label
                )

fig.show()
In [128]:
# avg number of medals for each team type 
n_games = len(medalavg_awayhome_df['Games'])
home_medal = medalavg_awayhome_df['home_avgmedal']
away_medal = medalavg_awayhome_df['away_avgmedal']
homeaway_comb = np.append(home_medal,away_medal)

df = pd.DataFrame(dict(game=medalavg_awayhome_df['Games'].tolist()*2, medal=homeaway_comb.tolist(),
                       team=["Home"]*n_games + ["Away"]*n_games))

fig = px.scatter(df, x="medal", y="game", color="team",
                 title="Average team medals per game",
                 labels={"medal":"Average medals"} # customize axis label
                )

fig.show()
In [129]:
# avg number of medals for each team type 

fig = go.Figure()
fig.add_trace(go.Bar(
    y=medalavg_awayhome_df['Games'].tolist(),
    x=medalavg_awayhome_df['home_avgmedal'].tolist(),
    name='Home teams',
    orientation='h',
    marker=dict(
        color='cornflowerblue',
        line=dict(color='cornflowerblue', width=1)
    )
))
fig.add_trace(go.Bar(
    y=medalavg_awayhome_df['Games'].tolist(),
    x=medalavg_awayhome_df['away_avgmedal'].tolist(),
    name='Away teams',
    orientation='h',
    marker=dict(
        color='#EF553B',
        line=dict(color='#EF553B', width=1)
    )
))

fig.update_layout(barmode='stack', title="Average team medals per game")
fig.show()
In [218]:
# avg number of gold medals for each team type 

fig = go.Figure()
fig.add_trace(go.Bar(
    y=goldcnt_awayhome_df['Games'].tolist(),
    x=goldcnt_awayhome_df['home_avggoldmedal'].tolist(),
    name='Home teams',
    orientation='h',
    marker=dict(
        color='cornflowerblue',
        line=dict(color='cornflowerblue', width=1)
    )
))
fig.add_trace(go.Bar(
    y=goldcnt_awayhome_df['Games'].tolist(),
    x=goldcnt_awayhome_df['away_avggoldmedal'].tolist(),
    name='Away teams',
    orientation='h',
    marker=dict(
        color='#EF553B',
        line=dict(color='#EF553B', width=1)
    )
))

fig.update_layout(barmode='stack', title="Average team gold medals per game")
fig.show()

Hypothesis 2: age, height, and weight influence performance¶

In [130]:
athlete_medals_df
Out[130]:
ID Name Sex avg_age avg_height avg_weight total_medal
0 94406 Michael Fred Phelps, II M 24.333333 193.00000 91.00000 28
1 67046 Larysa Semenivna Latynina (Diriy-) F 24.600000 161.00000 52.00000 18
2 4198 Nikolay Yefimovich Andrianov M 23.000000 166.00000 60.00000 15
3 11951 Ole Einar Bjrndalen M 30.857143 178.00000 65.00000 13
4 74420 Edoardo Mangiarotti M 33.400000 176.31541 71.94132 13
... ... ... ... ... ... ... ...
135566 135567 Aleksandr Viktorovich Zyuzin M 26.000000 183.00000 72.00000 0
135567 135568 Olga Igorevna Zyuzkova F 33.000000 171.00000 69.00000 0
135568 135569 Andrzej ya M 29.000000 179.00000 89.00000 0
135569 135570 Piotr ya M 27.000000 176.00000 59.00000 0
135570 135571 Tomasz Ireneusz ya M 32.000000 185.00000 96.00000 0

135571 rows × 7 columns

In [131]:
import plotly.express as px
fig = px.scatter_matrix(athlete_medals_df,
    dimensions=["avg_age", "avg_height", "avg_weight", "total_medal"],
    color="Sex",opacity=0.5)
fig.show()
In [153]:
for col in athlete_medals_df[["avg_age", "avg_height", "avg_weight"]]: 
    fig = px.scatter(athlete_medals_df, x=athlete_medals_df[col], y="total_medal",
                 size="total_medal", color="Sex",
                 hover_name="Name", trendline="ols")
    #fig.update_layeout()
    fig.show()
In [225]:
fig = px.scatter_matrix(athletes_gold_df,
    dimensions=["avg_age", "avg_height", "avg_weight", "total_goldmedal"],
    color="Sex",opacity=0.5)
fig.show()

Hypothesis 3: Male perform better than female in some sports (vice versa)¶

In [135]:
sportgendermedal_df
Out[135]:
Sport female_ath_count male_ath_count female_medal_cnt male_medal_cnt female_avgmedal male_avgmedal
0 Alpine Skiing 996 1739 213 215 0.213855 0.123634
1 Alpinism 1 24 1 24 1.000000 1.000000
2 Archery 500 613 121 232 0.242000 0.378467
3 Art Competitions 204 1610 11 145 0.053922 0.090062
4 Athletics 6529 15542 1275 2694 0.195283 0.173337
5 Badminton 412 399 84 84 0.203883 0.210526
6 Basketball 932 2481 393 687 0.421674 0.276904
7 Beach Volleyball 189 194 36 36 0.190476 0.185567
8 Biathlon 371 764 150 258 0.404313 0.337696
9 Bobsleigh 109 1585 24 374 0.220183 0.235962
10 Boxing 65 5197 24 920 0.369231 0.177025
11 Canoeing 702 2504 282 883 0.401709 0.352636
12 Croquet 3 7 0 8 0.000000 1.142857
13 Cross Country Skiing 717 1683 339 437 0.472803 0.259655
14 Curling 160 186 68 84 0.425000 0.451613
15 Cycling 714 5105 176 1087 0.246499 0.212929
16 Diving 635 831 201 226 0.316535 0.271961
17 Equestrianism 459 1886 207 758 0.450980 0.401909
18 Fencing 880 3243 349 1394 0.396591 0.429849
19 Figure Skating 824 748 191 195 0.231796 0.260695
20 Football 734 5427 302 1269 0.411444 0.233831
21 Freestyle Skiing 267 359 51 51 0.191011 0.142061
22 Golf 70 148 6 40 0.085714 0.270270
23 Gymnastics 1499 2635 701 1555 0.467645 0.590133
24 Handball 1027 1675 472 588 0.459591 0.351045
25 Hockey 996 2829 478 1050 0.479920 0.371156
26 Ice Hockey 498 3386 300 1230 0.602410 0.363260
27 Judo 757 1967 196 351 0.258917 0.178444
28 Luge 228 544 45 135 0.197368 0.248162
29 Modern Pentathlon 114 750 15 171 0.131579 0.228000
30 Motorboating 1 13 0 7 0.000000 0.538462
31 Rowing 1483 6204 720 2225 0.485502 0.358640
32 Rugby Sevens 148 151 36 38 0.243243 0.251656
33 Sailing 629 3851 134 1087 0.213037 0.282264
34 Shooting 737 4145 146 1082 0.198100 0.261037
35 Short Track Speed Skating 209 235 140 144 0.669856 0.612766
36 Skeleton 45 101 12 18 0.266667 0.178218
37 Ski Jumping 30 844 3 204 0.100000 0.241706
38 Snowboarding 239 328 45 45 0.188285 0.137195
39 Speed Skating 528 1054 242 338 0.458333 0.320683
40 Swimming 3621 5144 1374 1674 0.379453 0.325428
41 Table Tennis 377 372 84 84 0.222812 0.225806
42 Taekwondo 229 241 72 72 0.314410 0.298755
43 Tennis 486 760 140 200 0.288066 0.263158
44 Trampolining 44 49 15 15 0.340909 0.306122
45 Triathlon 175 180 15 15 0.085714 0.083333
46 Volleyball 1129 1374 474 495 0.419841 0.360262
47 Water Polo 337 2262 191 866 0.566766 0.382847
48 Weightlifting 356 2526 105 541 0.294944 0.214173
49 Wrestling 222 4766 68 1228 0.306306 0.257658
In [219]:
# avg number of medals for each gender in each sport 

fig = go.Figure()
fig.add_trace(go.Bar(
    y=sportgendermedal_df['Sport'].tolist(),
    x=sportgendermedal_df['male_avgmedal'].tolist(),
    name='Male',
    orientation='h',
    marker=dict(
        color='cornflowerblue',
        line=dict(color='cornflowerblue', width=1)
    )
))
fig.add_trace(go.Bar(
    y=sportgendermedal_df['Sport'].tolist(),
    x=sportgendermedal_df['female_avgmedal'].tolist(),
    name='Female',
    orientation='h',
    marker=dict(
        color='#EF553B',
        line=dict(color='#EF553B', width=1)
    )
))

fig.update_layout(barmode='stack', title="Average medals per sport")
fig.show()
In [159]:
# avg number of medals for each gender in each sport 
n_sport = len(sportgendermedal_df['Sport'])
f_avg= sportgendermedal_df['female_avgmedal']
m_avg = sportgendermedal_df['male_avgmedal']
fm_comb = np.append(m_avg,f_avg)

df = pd.DataFrame(dict(sport=sportgendermedal_df['Sport'].tolist()*2, medal=fm_comb.tolist(),
                       Gender= ["Male"]*n_sport + ["Female"]*n_sport))

fig = px.scatter(df, x="medal", y="sport", color="Gender",
                 title="Average gender medals per sport",
                 labels={"medal":"Average medals"} # customize axis label
                )

fig.show()
In [234]:
sportgender_gold_df
Out[234]:
Sport female_ath_count male_ath_count male_goldmedal_cnt female_goldmedal_cnt female_avggoldmedal male_avggoldmedal
0 Alpine Skiing 996 1739 71 72.0 0.072289 0.040828
1 Alpinism 1 24 24 1.0 1.000000 1.000000
2 Archery 500 613 92 43.0 0.086000 0.150082
3 Art Competitions 204 1610 48 1.0 0.004902 0.029814
4 Athletics 6529 15542 911 428.0 0.065554 0.058615
5 Badminton 412 399 27 27.0 0.065534 0.067669
6 Basketball 932 2481 234 131.0 0.140558 0.094317
7 Beach Volleyball 189 194 12 12.0 0.063492 0.061856
8 Biathlon 371 764 86 50.0 0.134771 0.112565
9 Bobsleigh 109 1585 125 8.0 0.073394 0.078864
10 Boxing 65 5197 246 6.0 0.092308 0.047335
11 Canoeing 702 2504 294 94.0 0.133903 0.117412
12 Croquet 3 7 4 0.0 0.000000 0.571429
13 Cross Country Skiing 717 1683 143 113.0 0.157601 0.084967
14 Curling 160 186 28 22.0 0.137500 0.150538
15 Cycling 714 5105 366 58.0 0.081232 0.071694
16 Diving 635 831 75 67.0 0.105512 0.090253
17 Equestrianism 459 1886 269 55.0 0.119826 0.142630
18 Fencing 880 3243 475 119.0 0.135227 0.146469
19 Figure Skating 824 748 66 65.0 0.078883 0.088235
20 Football 734 5427 414 101.0 0.137602 0.076285
21 Freestyle Skiing 267 359 17 17.0 0.063670 0.047354
22 Golf 70 148 13 2.0 0.028571 0.087838
23 Gymnastics 1499 2635 557 234.0 0.156104 0.211385
24 Handball 1027 1675 194 155.0 0.150925 0.115821
25 Hockey 996 2829 360 158.0 0.158635 0.127253
26 Ice Hockey 498 3386 407 101.0 0.202811 0.120201
27 Judo 757 1967 88 49.0 0.064729 0.044738
28 Luge 228 544 47 15.0 0.065789 0.086397
29 Modern Pentathlon 114 750 57 5.0 0.043860 0.076000
30 Motorboating 1 13 7 0.0 0.000000 0.538462
31 Rowing 1483 6204 740 238.0 0.160486 0.119278
32 Rugby Sevens 148 151 13 12.0 0.081081 0.086093
33 Sailing 629 3851 400 47.0 0.074722 0.103869
34 Shooting 737 4145 361 49.0 0.066486 0.087093
35 Short Track Speed Skating 209 235 48 47.0 0.224880 0.204255
36 Skeleton 45 101 6 4.0 0.088889 0.059406
37 Ski Jumping 30 844 68 1.0 0.033333 0.080569
38 Snowboarding 239 328 15 15.0 0.062762 0.045732
39 Speed Skating 528 1054 112 81.0 0.153409 0.106262
40 Swimming 3621 5144 606 493.0 0.136150 0.117807
41 Table Tennis 377 372 27 27.0 0.071618 0.072581
42 Taekwondo 229 241 20 20.0 0.087336 0.082988
43 Tennis 486 760 62 44.0 0.090535 0.081579
44 Trampolining 44 49 5 5.0 0.113636 0.102041
45 Triathlon 175 180 5 5.0 0.028571 0.027778
46 Volleyball 1129 1374 166 156.0 0.138175 0.120815
47 Water Polo 337 2262 287 63.0 0.186944 0.126879
48 Weightlifting 356 2526 182 35.0 0.098315 0.072051
49 Wrestling 222 4766 395 18.0 0.081081 0.082879
In [236]:
# avg number of gold medals for each gender in each sport 
fig = go.Figure()
fig.add_trace(go.Bar(
    y=sportgender_gold_df['Sport'].tolist(),
    x=sportgender_gold_df['male_avggoldmedal'].tolist(),
    name='Male',
    orientation='h',
    marker=dict(
        color='cornflowerblue',
        line=dict(color='cornflowerblue', width=1)
    )
))
fig.add_trace(go.Bar(
    y=sportgender_gold_df['Sport'].tolist(),
    x=sportgender_gold_df['female_avggoldmedal'].tolist(),
    name='Female',
    orientation='h',
    marker=dict(
        color='#EF553B',
        line=dict(color='#EF553B', width=1)
    )
))

fig.update_layout(barmode='stack', title="Average gold medals per sport")
fig.show()